From b57f6cef28593feeb9af8c3a72dd188fe2967252 Mon Sep 17 00:00:00 2001
From: Ivan Pavlovich <ivan.pavlovic@hes-so.ch>
Date: Thu, 13 Mar 2025 11:48:51 +0100
Subject: [PATCH] Counting number of published articles from local PubMed save
 (not working: too slow)

---
 .../__pycache__/pubmedApi.cpython-313.pyc     | Bin 6559 -> 7064 bytes
 dataSources/PubMed/data_num_locale.py         |  72 ++++++++++++++++++
 dataSources/PubMed/pubmedApi.py               |   6 +-
 testModel/dataset/create_test_data.py         |  34 +--------
 4 files changed, 77 insertions(+), 35 deletions(-)
 create mode 100644 dataSources/PubMed/data_num_locale.py

diff --git a/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc b/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc
index ff19b58d5a7d6f2c548d646fb5d9087fe9ffacd0..8efc23ccdcf2d6565397f9fe5b909684064402a2 100644
GIT binary patch
delta 759
zcmbPlJj0ytGcPX}0|Ns?(xOZ0h0+`OoSB#;xHktgr86;ZnY^3Dld)?vD=RxAb7ZjG
z<c+LSY}O1642fowb6M3Vhp|gd_F;R)>>DgM`5~JKM9N@tF01t90QM%v6=2po_7<?-
z`|L=1r6<2*=Yi@Mn*5HPn>SdFm4SgFR5(~!oq=I<0mmjL{b~k=P{B}Us|W^$L>2}H
z1qOF!-&G*7U`t(I28KLAcV^#S1_o}3%=XEF+}oKRGE6?e?V-4ZfkBZ$hd~46F$M;P
z6h;PyG$sWGc?JcLViv!u&BZ+OtV}iRo2LjmGSz=zV~|r_!FEC4;i87)0nLjVJ|`kC
zYJ^{ih`*?ja9K9tN@nqelBy333^k0tOdlB-N*Vo_J}@x&Getmnksw|aQ!<2?0^+4I
z<v@73OkWrn@|lX5zB4dnF&2XaLDUT$qYK6^7j;|@WL})iAu=N#iyPuDYQ$WYjk%JV
zf1#l40|P?^qbEpvA)^<_25+Vi2rm@G3u8)w@KQncr88xL>`P<J1_^?w$p=Mtv#~KK
zX|8aa>?kUda8W1cvSQAa()te!4313hAR|1OB0;JhnW8|vXo!K1OsODV8dCv8t`Nj4
zVsc>q2-4@s{DpzRnc0o`I|D-*qdQ0tMBOm9oZKKPE^wiw`l4~o0+t0)mlO;h7+Y>`
z5WULC6UxLOt^ARlK|uIh+~h=YBgX8>Q^Y-^Dwr7f6tD0qd|+b`5cyT?!N9<9P=z~0
zp5c%gcPI<PVJVqVP3FtC;-S{eSLC@u?3k};a)($lU$JBjv17i<!WF8_e3g$oRF3(o
h94m;e$rWnBe$|>QjB~TOL_8y>GZUj^Q5*vU0|3cu&L{u?

delta 374
zcmbPXKHr${GcPX}0|NuY${QEc7f5d8b7o?4<Jug|l+MKHJ$W~aC!^wKR#tXKM)%Ez
zY?_RWc9RX+BqvW|Z(>ZCtj{Jr*?^;kQE;*%ht%eH&Z$iLLJSO{f}zY-5ey88EDQ_^
z4DQUnjv%pMOI=<DhCD%cW?vNs25yLq-(+Q;?Ti(ZZ}WIEdowUh_U4shacA)p-0aFL
z&&m|awmD76k%|2a8-tS03b)C!qT*av(lW0rW?fXwx-r>NRC2P3XqO^X`hu~`MIF}z
znHO~eE(FG3)JeFKlzLq;?V@7ZWLdG@EF26bR+}ruuQKukGBHRie`IG65dIc5IZD!q
zF@18cq-RtK69b>(6<&o8YzzV-zluE=7#I$!a0i(&95Uk$mS#9CB@@EHd`Uz+M2Pv4
z8CS3<^JNC^U;*aK0<0jmG*_?{^JP`;U{mJHrmVqM%vTt=LipLQ2yuldY&MXJXXJEb
MVw5b3V_;wa0H5(-F8}}l

diff --git a/dataSources/PubMed/data_num_locale.py b/dataSources/PubMed/data_num_locale.py
new file mode 100644
index 000000000..c08b13da0
--- /dev/null
+++ b/dataSources/PubMed/data_num_locale.py
@@ -0,0 +1,72 @@
+import json
+import sys
+import os
+import statistics
+from datetime import datetime, timedelta
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+
+from variables.pubmed import NCDS_MESH_TERM, KEYWORDS_MESH_TERM, KEYWORDS_MESH_SUBHEADING, KEYWORDS_MESH_SITE_PROPOSITION, KEYWORDS_MESH_PROPOSITION
+
+DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data"))
+
+file_path = f"{DATA_DIR}/save_3_years.json"
+
+def match_mesh_terms(article_mesh_terms, ncd, keyword):
+    """Return True when the article carries ``ncd`` and the keyword term(s).
+
+    ``keyword`` is either one MeSH term or a list of terms; for a list,
+    every term must be present in ``article_mesh_terms``.
+    """
+    if ncd not in article_mesh_terms:
+        return False
+
+    if isinstance(keyword, list):
+        return all(k in article_mesh_terms for k in keyword)
+
+    return keyword in article_mesh_terms
+
+def filter_articles(articles, start_date, end_date, ncd, keyword):
+    """Return the articles dated in [start_date, end_date) matching the terms.
+
+    MeSH terms are lower-cased before being passed to ``match_mesh_terms``.
+    """
+    selected = []
+    for entry in articles:
+        date = entry["Date"]
+        published = datetime(int(date["Year"]), int(date["Month"]), int(date["Day"]))
+        if start_date <= published < end_date and match_mesh_terms(
+                [term.lower() for term in entry["MeshTerms"]], ncd, keyword):
+            selected.append(entry)
+    return selected
+
+with open(file_path, "r", encoding="utf-8") as file:
+    data = json.load(file)
+print(len(data))
+
+ncds_mesh_terms = [mesh_term.lower() for mesh_term in NCDS_MESH_TERM]
+keywords_mesh_terms = [mesh_term.lower() for mesh_term in KEYWORDS_MESH_TERM]
+keywords_subheading_mesh_terms = keywords_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_SUBHEADING]
+keywords_site_proposition_mesh_terms = keywords_subheading_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_SITE_PROPOSITION]
+keywords_proposition_mesh_terms = keywords_site_proposition_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_PROPOSITION]
+
+# Bucket lower-cased MeSH terms by week once instead of re-scanning every
+# article for each (NCD, keyword, week) triple, which was far too slow.
+first_day, last_day = datetime(2022, 1, 1), datetime(2024, 12, 31)
+nb_weeks = -(-(last_day - first_day).days // 7)  # ceil: last week may be partial
+weekly_terms = [[] for _ in range(nb_weeks)]
+for article in data:
+    date = article["Date"]
+    day = datetime(int(date["Year"]), int(date["Month"]), int(date["Day"]))
+    week = (day - first_day).days // 7
+    if 0 <= week < nb_weeks:
+        weekly_terms[week].append({term.lower() for term in article["MeshTerms"]})
+
+for ncd in ncds_mesh_terms:
+    count = [sum(match_mesh_terms(terms, ncd, keyword) for terms in week_terms)
+             for keyword in keywords_mesh_terms for week_terms in weekly_terms]
+    print(f"NCD: {ncd}")
+    print(f"Min: {min(count)}")
+    print(f"Max: {max(count)}")
+    print(f"Mean: {statistics.mean(count)}")
+    print(f"Median: {statistics.median(count)}")
diff --git a/dataSources/PubMed/pubmedApi.py b/dataSources/PubMed/pubmedApi.py
index bf08b5d9f..d41b83d5f 100644
--- a/dataSources/PubMed/pubmedApi.py
+++ b/dataSources/PubMed/pubmedApi.py
@@ -85,7 +85,7 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
                             for part in entrie["MedlineCitation"]["Article"]["Journal"]["Title"]:
                                 if "#text" in part:
                                     data["Title"] += part["#text"]
-                        elif not isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], str):
+                        elif "#text" in entrie["MedlineCitation"]["Article"]["Journal"]["Title"] and not isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], str):
                             data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]["#text"]
                         else:
                             data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]
@@ -95,7 +95,7 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
                             for part in entrie["MedlineCitation"]["Article"]["ArticleTitle"]:
                                 if "#text" in part:
                                     data["ArticleTitle"] += part["#text"]
-                        elif not isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], str):
+                        elif "#text" in entrie["MedlineCitation"]["Article"]["ArticleTitle"] and not isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], str):
                             data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"]["#text"]
                         else:
                             data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"]
@@ -106,6 +106,8 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
                                 for part in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]:
                                     if "#text" in part:
                                         data["Abstract"] += part["#text"]
+                            elif "#text" in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"] and not isinstance(entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"], str):
+                                data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]["#text"]
                             else:
                                 data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]
                         
diff --git a/testModel/dataset/create_test_data.py b/testModel/dataset/create_test_data.py
index c618a35a2..8d9df36c7 100644
--- a/testModel/dataset/create_test_data.py
+++ b/testModel/dataset/create_test_data.py
@@ -7,39 +7,7 @@ import time
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 
 from folders.dataSources.PubMed.pubmedApi import getPubmedData
-
-LABELS = [
-    "Noncommunicable Diseases",
-    "Diabetes",
-    "Cancer",
-    "Chronic respiratory disease",
-    "Cardiovascular diseases",
-    "Mental Health",
-    "Diabetes type 1",
-    "Diabetes type 2"
-]
-
-MESH = [
-    "Noncommunicable Diseases",
-    "Diabetes Mellitus",
-    "Neoplasms",
-    "Respiratory Tract Diseases",
-    "Cardiovascular Diseases",
-    "Mental Health",
-    "Diabetes Mellitus, Type 1",
-    "Diabetes Mellitus, Type 2"
-]
-
-MESH_TERMS = [
-    '"Noncommunicable+Diseases"[Mesh:noexp]',       # NCDs (All)
-    '"Diabetes+Mellitus"[Mesh:noexp]',              # Diabetes (type 1 or 2)
-    '"Neoplasms"[Mesh:noexp]',                      # Cancer
-    '"Respiratory+Tract+Diseases"[Mesh:noexp]',     # Chronic respiratory disease
-    '"Cardiovascular+Diseases"[Mesh:noexp]',        # Cardiovascular diseases
-    '"Mental+Health"[Mesh:noexp]',                  # Mental Health
-    '"Diabetes+Mellitus%2C+Type+1"[Mesh:noexp]',    # Diabetes type 1
-    '"Diabetes+Mellitus%2C+Type+2"[Mesh:noexp]'     # Diabetes type 2
-]
+from variables.pubmed import NCDS, NCDS_MESH_TERM
 
 date_min = "2024/01/01"
 date_max = "2024/12/31"
-- 
GitLab