Skip to content
Snippets Groups Projects
Commit 9676ac12 authored by Ivan Pavlovich's avatar Ivan Pavlovich
Browse files

Script for antoine. Missing categories identification in article (Access,...

Script for Antoine. Identification of missing categories in articles (access, care/management, policy, etc.)
parent 3f797315
Branches
No related tags found
No related merge requests found
import sys
import os
import urllib.parse
from requests import get
import xmltodict
from datetime import datetime, timedelta, date
import json
# -------------------
# Imports for variable file
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
from variables.pubmed import *
# -------------------
def _format_entrez_date(value):
    """Return ``value`` as ``YYYY/MM/DD`` when it is a date-like object.

    ``str(date)`` produces ``YYYY-MM-DD``, whereas the E-utilities
    documentation specifies ``YYYY/MM/DD`` for ``mindate``/``maxdate``.
    Values without ``strftime`` (e.g. pre-formatted strings) are returned
    unchanged.
    """
    if hasattr(value, "strftime"):
        return value.strftime("%Y/%m/%d")
    return value


def _get_with_retry(url, delay=1.0, timeout=30):
    """GET ``url``, retrying until a response is obtained.

    Each failure is printed and followed by a short pause so the remote
    service is not hit in a tight loop. The timeout prevents a stalled
    connection from blocking the script forever.
    """
    import time

    while True:
        try:
            return get(url, timeout=timeout)
        except Exception as e:
            print(e)
            time.sleep(delay)


def _as_list(value):
    """Normalise an xmltodict value to a list.

    xmltodict yields a dict/str for a single child element and a list for
    repeated ones; ``None`` stands for an empty element.
    """
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _extract_text(node):
    """Return the text held by an xmltodict node.

    * ``None``  -> empty string
    * ``str``   -> the string itself
    * ``list``  -> concatenation of the text of every part (plain strings
      and dicts carrying ``#text``; other parts are skipped)
    * ``dict``  -> its ``#text`` value when present, otherwise the dict is
      returned unchanged
    """
    if node is None:
        return ""
    if isinstance(node, str):
        return node
    if isinstance(node, list):
        pieces = []
        for part in node:
            if isinstance(part, str):
                pieces.append(part)
            elif part is not None and "#text" in part:
                pieces.append(part["#text"])
        return "".join(pieces)
    if "#text" in node:
        return node["#text"]
    return node


def _parse_article(entry):
    """Build the article dict for one parsed entry.

    Returns ``None`` when the entry has no ``MedlineCitation`` or no
    ``MeshHeadingList`` (such entries are ignored).
    """
    if not isinstance(entry, dict) or "MedlineCitation" not in entry:
        return None
    citation = entry["MedlineCitation"]
    if "MeshHeadingList" not in citation:
        return None

    article = citation["Article"]
    data = {}

    # PMID
    data["PMID"] = citation["PMID"]["#text"]

    # Journal title
    data["Title"] = _extract_text(article["Journal"]["Title"])

    # Article title
    data["ArticleTitle"] = _extract_text(article["ArticleTitle"])

    # Abstract
    data["Abstract"] = ""
    if "Abstract" in article:
        data["Abstract"] = _extract_text(article["Abstract"]["AbstractText"])

    # MeSH terms
    data["MeshTerms"] = [
        heading["DescriptorName"]["#text"]
        for heading in _as_list(citation["MeshHeadingList"]["MeshHeading"])
    ]

    # PubMed publication date ("Date" is left unset when no entry has the
    # "pubmed" status)
    for pub_date in _as_list(entry["PubmedData"]["History"]["PubMedPubDate"]):
        if pub_date["@PubStatus"] == "pubmed":
            data["Date"] = {
                "Year": pub_date["Year"],
                "Month": pub_date["Month"],
                "Day": pub_date["Day"]
            }
            break

    return data


def getPubmedData(term, date_min, date_max):
    """Search PubMed for ``term`` in a date window and return article data.

    Runs an ESearch with ``usehistory=y``, then an EFetch on the resulting
    query key / WebEnv, and parses the XML answer with xmltodict.

    Parameters:
        term: search term, interpolated into the URL as-is (the caller is
            expected to have URL-encoded it).
        date_min: start of the window; date-like objects are sent as
            ``YYYY/MM/DD``, other values are sent unchanged.
        date_max: end of the window, same handling as ``date_min``.

    Returns:
        A list of dicts with the keys ``PMID``, ``Title`` (journal title),
        ``ArticleTitle``, ``Abstract``, ``MeshTerms`` and, when a "pubmed"
        history date exists, ``Date`` (``Year``/``Month``/``Day``). Only
        entries that have a ``MeshHeadingList`` are included. An empty
        list is returned when the answer holds no article set.
    """
    print("--------------------")
    print(f"Term: {term}")
    print("--------------------")

    print("--------------------")
    print(f"Date min: {date_min}")
    print(f"Date max: {date_max}")
    print("--------------------")

    # Search :
    #   db: pubmed
    #   term : search term
    #   retmode : return data in json format
    #   mindate : start date
    #   maxdate : end date
    #   usehistory : keep the result set on the server for the fetch below
    min_date = _format_entrez_date(date_min)
    max_date = _format_entrez_date(date_max)
    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&api_key={PUBMED_API_KEY}&term={term}&retmode=json&mindate={min_date}&maxdate={max_date}&usehistory=y'

    response = _get_with_retry(url)
    print(response)

    search_res = response.json()

    query_key = search_res["esearchresult"]["querykey"]
    webenv = search_res["esearchresult"]["webenv"]

    print("--------------------")
    print(f"Query key: {query_key}")
    print(f"Web env: {webenv}")
    print("--------------------")

    # Fetch article data from articles PMIDs that we got from the search
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key={query_key}&WebEnv={webenv}"

    response = _get_with_retry(url)

    obj = xmltodict.parse(response.text)

    if "PubmedArticleSet" not in obj:
        return []

    article_set = obj["PubmedArticleSet"]
    # An empty <PubmedArticleSet/> is parsed as None: nothing to extract.
    if not isinstance(article_set, dict):
        return []

    data_list = []
    for value in article_set.values():
        # A single article comes back as a dict rather than a list.
        for entry in _as_list(value):
            data = _parse_article(entry)
            if data is not None:
                data_list.append(data)

    return data_list
def url_encode(text):
    """Percent-encode ``text`` for use in a query string.

    Spaces become ``+``; square brackets, parentheses, colons and double
    quotes are left untouched.
    """
    untouched = '[]():"'
    encoded = urllib.parse.quote_plus(text, safe=untouched)
    return encoded
def get_mesh_terms(terms):
    """Turn the values of ``terms`` into ``[Mesh:noexp]`` query fragments.

    A plain value becomes ``"value"[Mesh:noexp]``. A list value becomes
    its tagged members joined with ``AND`` and wrapped in parentheses.
    Returns one fragment per value, in the mapping's iteration order.
    """
    def as_fragment(value):
        if not isinstance(value, list):
            return f'"{value}"[Mesh:noexp]'
        joined = " AND ".join(f'"{name}"[Mesh:noexp]' for name in value)
        return f"( {joined} )"

    return [as_fragment(value) for value in terms.values()]
def get_subheadings(terms):
    """Turn the values of ``terms`` into ``[SubHeading:noexp]`` fragments.

    A plain value becomes ``"value"[SubHeading:noexp]``. A list value
    becomes its tagged members joined with ``AND`` and wrapped in
    parentheses. Returns one fragment per value, in iteration order.
    """
    def as_fragment(value):
        if not isinstance(value, list):
            return f'"{value}"[SubHeading:noexp]'
        joined = " AND ".join(f'"{name}"[SubHeading:noexp]' for name in value)
        return f"( {joined} )"

    return [as_fragment(value) for value in terms.values()]
def _article_matches_ncd(ncd_mesh, article_mesh_terms):
    """Tell whether an article's lowercased MeSH terms cover ``ncd_mesh``.

    ``ncd_mesh`` is either one MeSH term or a list of terms; a list
    matches only when every member is present, mirroring the ``AND`` used
    when list entries are turned into a query.
    """
    wanted = ncd_mesh if isinstance(ncd_mesh, list) else [ncd_mesh]
    return bool(wanted) and all(term.lower() in article_mesh_terms for term in wanted)


def main():
    """Collect yesterday-to-today PubMed articles for every NCD/keyword pair.

    For each NCD MeSH query and each keyword group, a search term
    ``<ncd> AND ( <kw1> OR <kw2> ... )`` is sent to ``getPubmedData``.
    Articles are de-duplicated on their PMID and tagged with an ``NCDs``
    list naming every entry of ``NCDS_MESH_TERM`` whose MeSH term(s)
    appear among the article's MeSH terms.

    Returns:
        The list of collected article dicts.
    """
    ncds_mesh_terms = get_mesh_terms(NCDS_MESH_TERM)
    keywords_mesh_terms = get_mesh_terms(KEYWORDS_MESH_TERM)
    keywords_site_proposition_mesh_terms = get_mesh_terms(KEYWORDS_MESH_SITE_PROPOSITION)
    keywords_proposition_mesh_terms = get_mesh_terms(KEYWORDS_MESH_PROPOSITION)
    keywords_subheadings = get_subheadings(KEYWORDS_MESH_SUBHEADING)

    keywords_groups = {
        "exact match" : keywords_mesh_terms,
        "site proposition" : keywords_site_proposition_mesh_terms,
        "personal proposition" : keywords_proposition_mesh_terms,
        "subheading" : keywords_subheadings
    }

    stored_articles = []
    # Set membership keeps the duplicate check cheap as results accumulate.
    stored_pmids = set()

    # The search window is the same for every query of this run.
    end_date = date.today()
    start_date = end_date - timedelta(days=1)

    for ncd_mesh_term in ncds_mesh_terms:
        print(f"NCD : {ncd_mesh_term}")

        for keywords in keywords_groups.values():
            # Joined outside the f-string: reusing the same quote character
            # inside a replacement field only parses on Python 3.12+.
            keywords_query = " OR ".join(keywords)
            search_term = f"{ncd_mesh_term} AND ( {keywords_query} )"
            print(f"Search term : {search_term}")

            articles = getPubmedData(url_encode(search_term), start_date, end_date)

            for article in articles:
                if article["PMID"] in stored_pmids:
                    continue
                stored_pmids.add(article["PMID"])

                article_mesh_terms = {mesh.lower() for mesh in article["MeshTerms"]}

                article["NCDs"] = [
                    ncd
                    for ncd, ncd_mesh in NCDS_MESH_TERM.items()
                    if _article_matches_ncd(ncd_mesh, article_mesh_terms)
                ]

                stored_articles.append(article)

    return stored_articles
# Run the collection only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
\ No newline at end of file
......@@ -37,7 +37,7 @@ models = [
# 'gemini-hosted',
# 'cohere-hosted',
'facebook/bart-large-mnli',
'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli',
# 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli',
'MoritzLaurer/deberta-v3-base-zeroshot-v1.1-all-33',
# 'MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli',
]
......@@ -117,6 +117,8 @@ for length_category in length_categories:
name += " les textes longs"
elif length_category == "VERY LONG":
name += " les textes très longs"
name = ""
ax.set_title(name)
ax.legend()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment