diff --git a/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc b/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc index ff19b58d5a7d6f2c548d646fb5d9087fe9ffacd0..8efc23ccdcf2d6565397f9fe5b909684064402a2 100644 Binary files a/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc and b/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc differ diff --git a/dataSources/PubMed/data_num_locale.py b/dataSources/PubMed/data_num_locale.py new file mode 100644 index 0000000000000000000000000000000000000000..c08b13da0b55c47df6685ae59d1d1c934f3274d6 --- /dev/null +++ b/dataSources/PubMed/data_num_locale.py @@ -0,0 +1,72 @@ +import json +import sys +import os +import statistics +from datetime import datetime, timedelta + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) + +from variables.pubmed import NCDS_MESH_TERM, KEYWORDS_MESH_TERM, KEYWORDS_MESH_SUBHEADING, KEYWORDS_MESH_SITE_PROPOSITION, KEYWORDS_MESH_PROPOSITION + +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data")) + +file_path = f"{DATA_DIR}/save_3_years.json" + +def match_mesh_terms(article_mesh_terms, ncd, keyword): + if ncd in article_mesh_terms: + if isinstance(keyword, list): + all_in = True + for k in keyword: + if k not in article_mesh_terms: + all_in = False + + return all_in + else: + return keyword in article_mesh_terms + else: + return False + +def filter_articles(articles, start_date, end_date, ncd, keyword): + filtered_articles = [] + + for article in articles: + article_date = datetime(int(article["Date"]["Year"]), int(article["Date"]["Month"]), int(article["Date"]["Day"])) + + if start_date <= article_date < end_date: + + mesh_terms = [mesh_term.lower() for mesh_term in article["MeshTerms"]] + if match_mesh_terms(mesh_terms, ncd, keyword): + filtered_articles.append(article) + + return filtered_articles + +with open(file_path, "r", encoding="utf-8") as file: + data = json.load(file) + +print(len(data)) + + +ncds_mesh_terms = [mesh_term.lower() for mesh_term in NCDS_MESH_TERM] +keywords_mesh_terms = [mesh_term.lower() for mesh_term in KEYWORDS_MESH_TERM] +keywords_subheading_mesh_terms = keywords_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_SUBHEADING] +keywords_site_proposition_mesh_terms = keywords_subheading_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_SITE_PROPOSITION] +keywords_proposition_mesh_terms = keywords_site_proposition_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_PROPOSITION] + +for ncd in ncds_mesh_terms: + count = [] + + for keyword in keywords_mesh_terms: + start_date = datetime(2022, 1, 1) + + while(start_date < datetime(2024, 12, 31)): + end_date = start_date + timedelta(weeks=1) + + count.append(len(filter_articles(data, start_date, end_date, ncd, keyword))) + + start_date = end_date + + print(f"NCD: {ncd}") + print(f"Min: {min(count)}") + print(f"Max: {max(count)}") + print(f"Mean: {statistics.mean(count)}") + print(f"Median: {statistics.median(count)}") diff --git a/dataSources/PubMed/pubmedApi.py b/dataSources/PubMed/pubmedApi.py index bf08b5d9f657a4c706a3939d2f223085dfb4b281..d41b83d5f3b828525d1630d378103089fc4c945e 100644 --- a/dataSources/PubMed/pubmedApi.py +++ b/dataSources/PubMed/pubmedApi.py @@ -85,7 +85,7 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store for part in entrie["MedlineCitation"]["Article"]["Journal"]["Title"]: if "#text" in part: data["Title"] += part["#text"] - elif not isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], str): + elif "#text" in entrie["MedlineCitation"]["Article"]["Journal"]["Title"] and not isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], str): data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]["#text"] else: data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"] @@ -95,7 +95,7 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store for part in entrie["MedlineCitation"]["Article"]["ArticleTitle"]: if "#text" in part: data["ArticleTitle"] += part["#text"] - elif not isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], str): + elif "#text" in entrie["MedlineCitation"]["Article"]["ArticleTitle"] and not isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], str): data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"]["#text"] else: data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"] @@ -106,6 +106,8 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store for part in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]: if "#text" in part: data["Abstract"] += part["#text"] + elif "#text" in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"] and not isinstance(entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"], str): + data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]["#text"] else: data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"] diff --git a/testModel/dataset/create_test_data.py b/testModel/dataset/create_test_data.py index c618a35a221026f143e70e46554cb38d0612dbdd..8d9df36c70c90e2ddd48f8f1b5e55d38abbe6fb8 100644 --- a/testModel/dataset/create_test_data.py +++ b/testModel/dataset/create_test_data.py @@ -7,39 +7,7 @@ import time sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))) from folders.dataSources.PubMed.pubmedApi import getPubmedData - -LABELS = [ - "Noncommunicable Diseases", - "Diabetes", - "Cancer", - "Chronic respiratory disease", - "Cardiovascular diseases", - "Mental Health", - "Diabetes type 1", - "Diabetes type 2" -] - -MESH = [ - "Noncommunicable Diseases", - "Diabetes Mellitus", - "Neoplasms", - "Respiratory Tract Diseases", - "Cardiovascular Diseases", - "Mental Health", - "Diabetes Mellitus, Type 1", - "Diabetes Mellitus, Type 2" -] - -MESH_TERMS = [ - '"Noncommunicable+Diseases"[Mesh:noexp]', # NCDs (All) - '"Diabetes+Mellitus"[Mesh:noexp]', # Diabetes (type 1 or 2) - '"Neoplasms"[Mesh:noexp]', # Cancer - '"Respiratory+Tract+Diseases"[Mesh:noexp]', # Chronic respiratory disease - '"Cardiovascular+Diseases"[Mesh:noexp]', # Cardiovascular diseases - '"Mental+Health"[Mesh:noexp]', # Mental Health - '"Diabetes+Mellitus%2C+Type+1"[Mesh:noexp]', # Diabetes type 1 - '"Diabetes+Mellitus%2C+Type+2"[Mesh:noexp]' # Diabetes type 2 -] +from variables.pubmed import NCDS, NCDS_MESH_TERM date_min = "2024/01/01" date_max = "2024/12/31"