diff --git a/dataSources/PubMed/antoine_ex.py b/dataSources/PubMed/antoine_ex.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0ed71399cc83b7cfe9fbaab449cca3486bba1bd
--- /dev/null
+++ b/dataSources/PubMed/antoine_ex.py
@@ -0,0 +1,206 @@
+import sys
+import os
+import urllib.parse
+from requests import get
+import xmltodict
+from datetime import datetime, timedelta, date
+import json
+
+# -------------------
+# Imports for variable file
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+
+from variables.pubmed import *
+# -------------------
+
+def _request_with_retry(url, max_attempts=5):
+    """GET *url*, retrying on transient errors.
+
+    Replaces an unbounded ``while True`` retry loop, which could spin
+    forever on a persistent failure.  Raises RuntimeError after
+    *max_attempts* failed attempts.
+    """
+    last_error = None
+    for _ in range(max_attempts):
+        try:
+            return get(url)
+        except Exception as e:  # transient network error: log and retry
+            print(e)
+            last_error = e
+    raise RuntimeError(f"Request failed after {max_attempts} attempts: {url}") from last_error
+
+def _join_text(node):
+    """Flatten an xmltodict text node to a plain string.
+
+    Handles the three shapes xmltodict produces: a plain string, a dict
+    with a '#text' key, or a list of such parts (mixed-content XML).
+    """
+    if isinstance(node, list):
+        return "".join(part["#text"] for part in node if "#text" in part)
+    if isinstance(node, str):
+        return node
+    if "#text" in node:
+        return node["#text"]
+    return ""
+
+def getPubmedData(term, date_min, date_max):
+    """Search PubMed for *term* between *date_min* and *date_max*.
+
+    Returns a list of dicts with keys PMID, Title, ArticleTitle, Abstract,
+    MeshTerms and Date.  Only articles carrying a MeshHeadingList are kept.
+    """
+    print("--------------------")
+    print(f"Term: {term}")
+    print("--------------------")
+    print("--------------------")
+    print(f"Date min: {date_min}")
+    print(f"Date max: {date_max}")
+    print("--------------------")
+
+    # esearch: resolve the term to PMIDs, stored server-side (usehistory=y)
+    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&api_key={PUBMED_API_KEY}&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y'
+
+    response = _request_with_retry(url)
+    print(response)
+
+    search_res = response.json()
+    query_key = search_res["esearchresult"]["querykey"]
+    webenv = search_res["esearchresult"]["webenv"]
+
+    print("--------------------")
+    print(f"Query key: {query_key}")
+    print(f"Web env: {webenv}")
+    print("--------------------")
+
+    # efetch: pull the article records referenced by the stored search
+    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key={query_key}&WebEnv={webenv}"
+
+    response = _request_with_retry(url)
+
+    obj = xmltodict.parse(response.text)
+    if "PubmedArticleSet" not in obj:
+        return []
+    obj = obj["PubmedArticleSet"]
+
+    data_list = []
+    for key in obj:
+        if not isinstance(obj[key], list):
+            continue
+        for entry in obj[key]:
+            if "MedlineCitation" not in entry:
+                continue
+            citation = entry["MedlineCitation"]
+            if "MeshHeadingList" not in citation:
+                continue
+
+            article = citation["Article"]
+            data = {
+                "PMID": citation["PMID"]["#text"],
+                "Title": _join_text(article["Journal"]["Title"]),
+                "ArticleTitle": _join_text(article["ArticleTitle"]),
+                "Abstract": _join_text(article["Abstract"]["AbstractText"]) if "Abstract" in article else "",
+            }
+
+            # xmltodict yields a dict for a single MeSH heading, a list otherwise
+            headings = citation["MeshHeadingList"]["MeshHeading"]
+            if not isinstance(headings, list):
+                headings = [headings]
+            data["MeshTerms"] = [h["DescriptorName"]["#text"] for h in headings]
+
+            # PubMed publication date; loop variable named pub_date so it does
+            # not shadow datetime.date imported above
+            for pub_date in entry["PubmedData"]["History"]["PubMedPubDate"]:
+                if pub_date["@PubStatus"] == "pubmed":
+                    data["Date"] = {
+                        "Year": pub_date["Year"],
+                        "Month": pub_date["Month"],
+                        "Day": pub_date["Day"]
+                    }
+                    break
+
+            data_list.append(data)
+
+    return data_list
+
+def url_encode(text):
+    """URL-encode a search term, keeping PubMed query syntax characters intact."""
+    return urllib.parse.quote_plus(text, safe='[]():"')
+
+def _field_query(terms, field):
+    """Build PubMed query fragments tagging each term (or AND-group) with [field]."""
+    res = []
+
+    for item in terms.values():
+        if isinstance(item, list):
+            tmp = [f'"{i}"[{field}]' for i in item]
+            res.append("( " + " AND ".join(tmp) + " )")
+        else:
+            res.append(f'"{item}"[{field}]')
+
+    return res
+
+def get_mesh_terms(terms):
+    """Format a {name: term(s)} mapping as [Mesh:noexp] query fragments."""
+    return _field_query(terms, "Mesh:noexp")
+
+def get_subheadings(terms):
+    """Format a {name: term(s)} mapping as [SubHeading:noexp] query fragments."""
+    return _field_query(terms, "SubHeading:noexp")
+
+def main():
+    ncds_mesh_terms = get_mesh_terms(NCDS_MESH_TERM)
+    keywords_mesh_terms = get_mesh_terms(KEYWORDS_MESH_TERM)
+    keywords_site_proposition_mesh_terms = get_mesh_terms(KEYWORDS_MESH_SITE_PROPOSITION)
+    keywords_proposition_mesh_terms = get_mesh_terms(KEYWORDS_MESH_PROPOSITION)
+    keywords_subheadings = get_subheadings(KEYWORDS_MESH_SUBHEADING)
+
+    keywords_groups = {
+        "exact match": keywords_mesh_terms,
+        "site proposition": keywords_site_proposition_mesh_terms,
+        "personal proposition": keywords_proposition_mesh_terms,
+        "subheading": keywords_subheadings
+    }
+
+    stored_articles = []
+    stored_pmids = []
+
+    for ncd_mesh_term in ncds_mesh_terms:
+        print(f"NCD : {ncd_mesh_term}")
+
+        for keywords in keywords_groups.values():
+            # Join outside the f-string: re-using the same quote character
+            # inside an f-string is a syntax error before Python 3.12.
+            joined_keywords = " OR ".join(keywords)
+            search_term = f"{ncd_mesh_term} AND ( {joined_keywords} )"
+
+            print(f"Search term : {search_term}")
+
+            end_date = date.today()
+            start_date = end_date - timedelta(days=1)
+
+            articles = getPubmedData(url_encode(search_term), start_date, end_date)
+
+            for article in articles:
+                if article["PMID"] in stored_pmids:
+                    continue
+                stored_pmids.append(article["PMID"])
+
+                # fixed typo: was `atricle` (NameError at runtime)
+                article_mesh_terms = [mesh.lower() for mesh in article["MeshTerms"]]
+
+                # Tag the article with every NCD whose MeSH term(s) it carries.
+                # Iterate .items(): bare dict iteration yields keys only, so a
+                # two-name unpack over the dict itself raises ValueError.
+                article["NCDs"] = []
+                for ncd, ncd_mesh in NCDS_MESH_TERM.items():
+                    meshes = ncd_mesh if isinstance(ncd_mesh, list) else [ncd_mesh]
+                    if any(m.lower() in article_mesh_terms for m in meshes):
+                        article["NCDs"].append(ncd)
+
+                stored_articles.append(article)
+
+    # TODO: persist stored_articles somewhere; they are only held in memory here
+    print(f"Collected {len(stored_articles)} unique articles")
+
+if __name__ == "__main__":
+    main()
diff --git a/testModel/show_graph.py b/testModel/show_graph.py
index 2d3853dc20327c1d48ba64cda502c471e76c7042..d54356edf4d8a67b6a366651a57f69b4b7803da9 100644
--- a/testModel/show_graph.py
+++ b/testModel/show_graph.py
@@ -37,7 +37,7 @@ models = [
     # 'gemini-hosted',
     # 'cohere-hosted',
     'facebook/bart-large-mnli',
-    'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli',
+    # 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli',
     'MoritzLaurer/deberta-v3-base-zeroshot-v1.1-all-33',
     # 'MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli',
 ]
@@ -117,6 +117,8 @@
             name += " les textes longs"
         elif length_category == "VERY LONG":
             name += " les textes très longs"
 
+
+        name = ""
         ax.set_title(name)
         ax.legend()