From b57f6cef28593feeb9af8c3a72dd188fe2967252 Mon Sep 17 00:00:00 2001 From: Ivan Pavlovich <ivan.pavlovic@hes-so.ch> Date: Thu, 13 Mar 2025 11:48:51 +0100 Subject: [PATCH] Counting number of published articles from local pubmed save (not working to slow) --- .../__pycache__/pubmedApi.cpython-313.pyc | Bin 6559 -> 7064 bytes dataSources/PubMed/data_num_locale.py | 72 ++++++++++++++++++ dataSources/PubMed/pubmedApi.py | 6 +- testModel/dataset/create_test_data.py | 34 +-------- 4 files changed, 77 insertions(+), 35 deletions(-) create mode 100644 dataSources/PubMed/data_num_locale.py diff --git a/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc b/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc index ff19b58d5a7d6f2c548d646fb5d9087fe9ffacd0..8efc23ccdcf2d6565397f9fe5b909684064402a2 100644 GIT binary patch delta 759 zcmbPlJj0ytGcPX}0|Ns?(xOZ0h0+`OoSB#;xHktgr86;ZnY^3Dld)?vD=RxAb7ZjG z<c+LSY}O1642fowb6M3Vhp|gd_F;R)>>DgM`5~JKM9N@tF01t90QM%v6=2po_7<?- z`|L=1r6<2*=Yi@Mn*5HPn>SdFm4SgFR5(~!oq=I<0mmjL{b~k=P{B}Us|W^$L>2}H z1qOF!-&G*7U`t(I28KLAcV^#S1_o}3%=XEF+}oKRGE6?e?V-4ZfkBZ$hd~46F$M;P z6h;PyG$sWGc?JcLViv!u&BZ+OtV}iRo2LjmGSz=zV~|r_!FEC4;i87)0nLjVJ|`kC zYJ^{ih`*?ja9K9tN@nqelBy333^k0tOdlB-N*Vo_J}@x&Getmnksw|aQ!<2?0^+4I z<v@73OkWrn@|lX5zB4dnF&2XaLDUT$qYK6^7j;|@WL})iAu=N#iyPuDYQ$WYjk%JV zf1#l40|P?^qbEpvA)^<_25+Vi2rm@G3u8)w@KQncr88xL>`P<J1_^?w$p=Mtv#~KK zX|8aa>?kUda8W1cvSQAa()te!4313hAR|1OB0;JhnW8|vXo!K1OsODV8dCv8t`Nj4 zVsc>q2-4@s{DpzRnc0o`I|D-*qdQ0tMBOm9oZKKPE^wiw`l4~o0+t0)mlO;h7+Y>` z5WULC6UxLOt^ARlK|uIh+~h=YBgX8>Q^Y-^Dwr7f6tD0qd|+b`5cyT?!N9<9P=z~0 zp5c%gcPI<PVJVqVP3FtC;-S{eSLC@u?3k};a)($lU$JBjv17i<!WF8_e3g$oRF3(o h94m;e$rWnBe$|>QjB~TOL_8y>GZUj^Q5*vU0|3cu&L{u? delta 374 zcmbPXKHr${GcPX}0|NuY${QEc7f5d8b7o?4<Jug|l+MKHJ$W~aC!^wKR#tXKM)%Ez zY?_RWc9RX+BqvW|Z(>ZCtj{Jr*?^;kQE;*%ht%eH&Z$iLLJSO{f}zY-5ey88EDQ_^ z4DQUnjv%pMOI=<DhCD%cW?vNs25yLq-(+Q;?Ti(ZZ}WIEdowUh_U4shacA)p-0aFL z&&m|awmD76k%|2a8-tS03b)C!qT*av(lW0rW?fXwx-r>NRC2P3XqO^X`hu~`MIF}z znHO~eE(FG3)JeFKlzLq;?V@7ZWLdG@EF26bR+}ruuQKukGBHRie`IG65dIc5IZD!q zF@18cq-RtK69b>(6<&o8YzzV-zluE=7#I$!a0i(&95Uk$mS#9CB@@EHd`Uz+M2Pv4 z8CS3<^JNC^U;*aK0<0jmG*_?{^JP`;U{mJHrmVqM%vTt=LipLQ2yuldY&MXJXXJEb MVw5b3V_;wa0H5(-F8}}l diff --git a/dataSources/PubMed/data_num_locale.py b/dataSources/PubMed/data_num_locale.py new file mode 100644 index 000000000..c08b13da0 --- /dev/null +++ b/dataSources/PubMed/data_num_locale.py @@ -0,0 +1,72 @@ +import json +import sys +import os +import statistics +from datetime import datetime, timedelta + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) + +from variables.pubmed import NCDS_MESH_TERM, KEYWORDS_MESH_TERM, KEYWORDS_MESH_SUBHEADING, KEYWORDS_MESH_SITE_PROPOSITION, KEYWORDS_MESH_PROPOSITION + +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data")) + +file_path = f"{DATA_DIR}/save_3_years.json" + +def match_mesh_terms(article_mesh_terms, ncd, keyword): + if ncd in article_mesh_terms: + if isinstance(keyword, list): + all_in = True + for k in keyword: + if k not in article_mesh_terms: + all_in = False + + return all_in + else: + return keyword in article_mesh_terms + else: + return False + +def filter_articles(articles, start_date, end_date, ncd, keyword): + filtered_articles = [] + + for article in articles: + article_date = datetime(int(article["Date"]["Year"]), int(article["Date"]["Month"]), int(article["Date"]["Day"])) + + if start_date <= article_date < end_date: + + mesh_terms = [mesh_term.lower() for mesh_term in article["MeshTerms"]] + if match_mesh_terms(mesh_terms, ncd, keyword): + filtered_articles.append(article) + + return filtered_articles + +with open(file_path, "r", encoding="utf-8") as file: + data = json.load(file) + +print(len(data)) + + +ncds_mesh_terms = [mesh_term.lower() for mesh_term in NCDS_MESH_TERM] +keywords_mesh_terms = [mesh_term.lower() for mesh_term in KEYWORDS_MESH_TERM] +keywords_subheading_mesh_terms = keywords_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_SUBHEADING] +keywords_site_proposition_mesh_terms = keywords_subheading_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_SITE_PROPOSITION] +keywords_proposition_mesh_terms = keywords_site_proposition_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_PROPOSITION] + +for ncd in ncds_mesh_terms: + count = [] + + for keyword in keywords_mesh_terms: + start_date = datetime(2022, 1, 1) + + while(start_date < datetime(2024, 12, 31)): + end_date = start_date + timedelta(weeks=1) + + count.append(len(filter_articles(data, start_date, end_date, ncd, keyword))) + + start_date = end_date + + print(f"NCD: {ncd}") + print(f"Min: {min(count)}") + print(f"Max: {max(count)}") + print(f"Mean: {statistics.mean(count)}") + print(f"Median: {statistics.median(count)}") diff --git a/dataSources/PubMed/pubmedApi.py b/dataSources/PubMed/pubmedApi.py index bf08b5d9f..d41b83d5f 100644 --- a/dataSources/PubMed/pubmedApi.py +++ b/dataSources/PubMed/pubmedApi.py @@ -85,7 +85,7 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store for part in entrie["MedlineCitation"]["Article"]["Journal"]["Title"]: if "#text" in part: data["Title"] += part["#text"] - elif not isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], str): + elif "#text" in entrie["MedlineCitation"]["Article"]["Journal"]["Title"] and not isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], str): data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]["#text"] else: data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"] @@ -95,7 +95,7 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store for part in entrie["MedlineCitation"]["Article"]["ArticleTitle"]: if "#text" in part: data["ArticleTitle"] += part["#text"] - elif not isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], str): + elif "#text" in entrie["MedlineCitation"]["Article"]["ArticleTitle"] and not isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], str): data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"]["#text"] else: data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"] @@ -106,6 +106,8 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store for part in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]: if "#text" in part: data["Abstract"] += part["#text"] + elif "#text" in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"] and not isinstance(entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"], str): + data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]["#text"] else: data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"] diff --git a/testModel/dataset/create_test_data.py b/testModel/dataset/create_test_data.py index c618a35a2..8d9df36c7 100644 --- a/testModel/dataset/create_test_data.py +++ b/testModel/dataset/create_test_data.py @@ -7,39 +7,7 @@ import time sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))) from folders.dataSources.PubMed.pubmedApi import getPubmedData - -LABELS = [ - "Noncommunicable Diseases", - "Diabetes", - "Cancer", - "Chronic respiratory disease", - "Cardiovascular diseases", - "Mental Health", - "Diabetes type 1", - "Diabetes type 2" -] - -MESH = [ - "Noncommunicable Diseases", - "Diabetes Mellitus", - "Neoplasms", - "Respiratory Tract Diseases", - "Cardiovascular Diseases", - "Mental Health", - "Diabetes Mellitus, Type 1", - "Diabetes Mellitus, Type 2" -] - -MESH_TERMS = [ - '"Noncommunicable+Diseases"[Mesh:noexp]', # NCDs (All) - '"Diabetes+Mellitus"[Mesh:noexp]', # Diabetes (type 1 or 2) - '"Neoplasms"[Mesh:noexp]', # Cancer - '"Respiratory+Tract+Diseases"[Mesh:noexp]', # Chronic respiratory disease - '"Cardiovascular+Diseases"[Mesh:noexp]', # Cardiovascular diseases - '"Mental+Health"[Mesh:noexp]', # Mental Health - '"Diabetes+Mellitus%2C+Type+1"[Mesh:noexp]', # Diabetes type 1 - '"Diabetes+Mellitus%2C+Type+2"[Mesh:noexp]' # Diabetes type 2 -] +from variables.pubmed import NCDS, NCDS_MESH_TERM date_min = "2024/01/01" date_max = "2024/12/31" -- GitLab