From 9676ac126db0c37bb402a91e312e9fc9a9627ec6 Mon Sep 17 00:00:00 2001
From: Ivan Pavlovich <ivan.pavlovic@hes-so.ch>
Date: Fri, 11 Apr 2025 13:09:27 +0200
Subject: [PATCH] Script for Antoine. Missing categories identification in
 article (Access, care/management, policy, etc.)

---
 dataSources/PubMed/antoine_ex.py | 214 +++++++++++++++++++++++++++++++
 testModel/show_graph.py          |   4 +-
 2 files changed, 217 insertions(+), 1 deletion(-)
 create mode 100644 dataSources/PubMed/antoine_ex.py

diff --git a/dataSources/PubMed/antoine_ex.py b/dataSources/PubMed/antoine_ex.py
new file mode 100644
index 00000000..b0ed7139
--- /dev/null
+++ b/dataSources/PubMed/antoine_ex.py
@@ -0,0 +1,214 @@
+import sys
+import os
+import urllib.parse
+from requests import get
+import xmltodict
+from datetime import datetime, timedelta, date
+import json
+
+# -------------------
+# Imports for variable file
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+
+from variables.pubmed import *
+# -------------------
+
+def _as_list(node):
+    """Normalise an xmltodict value: None -> [], a list -> itself, anything else -> [node]."""
+    if node is None:
+        return []
+    return node if isinstance(node, list) else [node]
+
+def _node_text(node):
+    """Return the text of an xmltodict node.
+
+    Accepts None (-> ""), a str, a dict holding its text under "#text", or a
+    list of such parts (concatenated). Any other dict is returned unchanged.
+    """
+    if node is None:
+        return ""
+    if isinstance(node, list):
+        parts = (_node_text(part) for part in node)
+        return "".join(part for part in parts if isinstance(part, str))
+    if isinstance(node, dict) and "#text" in node:
+        return node["#text"]
+    return node
+
+def _get_with_retry(url):
+    """GET `url`, retrying (after printing the error) until a response arrives."""
+    import time  # local import: only needed for the pause between attempts
+    while True:
+        try:
+            # The timeout keeps a stalled connection from blocking forever.
+            return get(url, timeout=60)
+        except Exception as e:
+            print(e)
+            time.sleep(1)  # avoid hammering the server in a tight loop
+
+def getPubmedData(term, date_min, date_max):
+    """Search PubMed and return the matching articles that carry MeSH headings.
+
+    Args:
+        term: Search expression; inserted verbatim into the request URL, so it
+            must already be URL-encoded.
+        date_min: Start of the date range, sent as `mindate`.
+        date_max: End of the date range, sent as `maxdate`.
+
+    Returns:
+        A list of dicts with the keys "PMID", "Title" (journal title),
+        "ArticleTitle", "Abstract", "MeshTerms" and, when a "pubmed" history
+        date exists, "Date" ({"Year", "Month", "Day"}). The list is empty when
+        the response holds no article set.
+    """
+    print("--------------------")
+    print(f"Term: {term}")
+    print("--------------------")
+    print("--------------------")
+    print(f"Date min: {date_min}")
+    print(f"Date max: {date_max}")
+    print("--------------------")
+
+    # Search :
+    #   db: pubmed
+    #   term : search term
+    #   retmode : return data in json format
+    #   mindate : start date
+    #   maxdate : end date
+    #   usehistory : keep the result set on the server; the response then
+    #                carries the querykey / webenv pair reused by the fetch
+    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&api_key={PUBMED_API_KEY}&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y'
+    response = _get_with_retry(url)
+    print(response)
+
+    search_res = response.json()
+    query_key = search_res["esearchresult"]["querykey"]
+    webenv = search_res["esearchresult"]["webenv"]
+
+    print("--------------------")
+    print(f"Query key: {query_key}")
+    print(f"Web env: {webenv}")
+    print("--------------------")
+
+    # Fetch article data from articles PMIDs that we got from the search
+    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key={query_key}&WebEnv={webenv}"
+    response = _get_with_retry(url)
+
+    obj = xmltodict.parse(response.text)
+    # A missing or empty article set means there is nothing to extract.
+    article_set = obj.get("PubmedArticleSet")
+    if not isinstance(article_set, dict):
+        return []
+
+    data_list = []
+
+    for value in article_set.values():
+        # A tag that occurs once is parsed to a single node rather than a list.
+        for entrie in _as_list(value):
+            if not isinstance(entrie, dict) or "MedlineCitation" not in entrie:
+                continue
+            citation = entrie["MedlineCitation"]
+            if "MeshHeadingList" not in citation:
+                continue
+            article = citation["Article"]
+            data = {
+                "PMID": citation["PMID"]["#text"],
+                "Title": _node_text(article["Journal"]["Title"]),  # journal title
+                "ArticleTitle": _node_text(article["ArticleTitle"]),
+                "Abstract": "",
+                "MeshTerms": [],
+            }
+            if "Abstract" in article:
+                data["Abstract"] = _node_text(article["Abstract"]["AbstractText"])
+
+            for mesh_heading in _as_list(citation["MeshHeadingList"]["MeshHeading"]):
+                data["MeshTerms"].append(mesh_heading["DescriptorName"]["#text"])
+
+            # PubMed publication date
+            for pub_date in _as_list(entrie["PubmedData"]["History"]["PubMedPubDate"]):
+                if pub_date["@PubStatus"] == "pubmed":
+                    data["Date"] = {
+                        "Year": pub_date["Year"],
+                        "Month": pub_date["Month"],
+                        "Day": pub_date["Day"]
+                    }
+                    break
+            data_list.append(data)
+
+    return data_list
+
+def url_encode(text):  # percent-encode a search expression for use in a URL
+    return urllib.parse.quote_plus(text, safe='[]():"')  # []():" stay literal, spaces become "+"
+
+def get_mesh_terms(terms):
+    """Build one `[Mesh:noexp]` PubMed clause per value of `terms`."""
+    clauses = []
+    for entry in terms.values():
+        if not isinstance(entry, list):
+            clauses.append(f'"{entry}"[Mesh:noexp]')
+            continue
+        # A list means all of its terms are required: AND them in parentheses.
+        joined = " AND ".join(f'"{name}"[Mesh:noexp]' for name in entry)
+        clauses.append("( " + joined + " )")
+    return clauses
+
+def get_subheadings(terms):
+    """Build one `[SubHeading:noexp]` PubMed clause per value of `terms`."""
+
+    def tag(name):
+        return f'"{name}"[SubHeading:noexp]'
+
+    # Lists become a parenthesised AND group; any other value is one clause.
+    return [
+        "( " + " AND ".join(tag(i) for i in item) + " )" if isinstance(item, list) else tag(item)
+        for item in terms.values()
+    ]
+
+def main():
+    """Search PubMed for every NCD / keyword-group pair and tag articles by NCD.
+
+    The search window runs from yesterday to today. Articles are de-duplicated
+    by PMID; each kept article gets an "NCDs" list naming the NCDS_MESH_TERM
+    entries whose MeSH term(s) all appear among the article's MeSH terms.
+
+    Returns:
+        The list of tagged article dicts, in the order they were first seen.
+    """
+    ncds_mesh_terms = get_mesh_terms(NCDS_MESH_TERM)
+    keywords_groups = {
+        "exact match": get_mesh_terms(KEYWORDS_MESH_TERM),
+        "site proposition": get_mesh_terms(KEYWORDS_MESH_SITE_PROPOSITION),
+        "personal proposition": get_mesh_terms(KEYWORDS_MESH_PROPOSITION),
+        "subheading": get_subheadings(KEYWORDS_MESH_SUBHEADING),
+    }
+
+    # The search window is the same for every query, so compute it once.
+    end_date = date.today()
+    start_date = end_date - timedelta(days=1)
+    stored_articles = []
+    stored_pmids = set()  # set membership keeps the duplicate check O(1)
+
+    for ncd_mesh_term in ncds_mesh_terms:
+        print(f"NCD : {ncd_mesh_term}")
+        for keywords in keywords_groups.values():
+            # Join outside the f-string: reusing the enclosing quote character
+            # inside a replacement field is a syntax error before Python 3.12.
+            keywords_query = " OR ".join(keywords)
+            search_term = f"{ncd_mesh_term} AND ( {keywords_query} )"
+            print(f"Search term : {search_term}")
+            articles = getPubmedData(url_encode(search_term), start_date, end_date)
+            for article in articles or []:  # tolerate a None result
+                if article["PMID"] in stored_pmids:
+                    continue
+                stored_pmids.add(article["PMID"])
+                article_mesh_terms = {mesh.lower() for mesh in article["MeshTerms"]}
+                article["NCDs"] = []
+                for ncd, ncd_mesh in NCDS_MESH_TERM.items():
+                    # A value is one term or a list of terms that must all match.
+                    wanted = ncd_mesh if isinstance(ncd_mesh, list) else [ncd_mesh]
+                    if wanted and all(term.lower() in article_mesh_terms for term in wanted):
+                        article["NCDs"].append(ncd)
+                stored_articles.append(article)
+    return stored_articles
+
+if __name__ == "__main__":  # run the search only when executed as a script
+    main()
\ No newline at end of file
diff --git a/testModel/show_graph.py b/testModel/show_graph.py
index 2d3853dc..d54356ed 100644
--- a/testModel/show_graph.py
+++ b/testModel/show_graph.py
@@ -37,7 +37,7 @@ models = [
     # 'gemini-hosted',
     # 'cohere-hosted',
     'facebook/bart-large-mnli',
-    'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli',
+    # 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli',
     'MoritzLaurer/deberta-v3-base-zeroshot-v1.1-all-33',
     # 'MoritzLaurer/multilingual-MiniLMv2-L6-mnli-xnli',
 ]
@@ -117,6 +117,8 @@ for length_category in length_categories:
         name += " les textes longs"
     elif length_category == "VERY LONG":
         name += " les textes très longs"
+
+    name = ""
     
     ax.set_title(name)
     ax.legend()
-- 
GitLab