Skip to content
Snippets Groups Projects
Commit b57f6cef authored by Ivan Pavlovich's avatar Ivan Pavlovich
Browse files

Counting the number of published articles from the local PubMed save (not working: too slow)

parent 90b0177b
Branches
No related tags found
No related merge requests found
No preview for this file type
import json
import sys
import os
import statistics
from datetime import datetime, timedelta
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
from variables.pubmed import NCDS_MESH_TERM, KEYWORDS_MESH_TERM, KEYWORDS_MESH_SUBHEADING, KEYWORDS_MESH_SITE_PROPOSITION, KEYWORDS_MESH_PROPOSITION
# Absolute path of the "data" folder that sits next to this script.
_SCRIPT_DIR = os.path.dirname(__file__)
DATA_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "./data"))
# JSON file with the saved articles that this script loads.
file_path = f"{DATA_DIR}/save_3_years.json"
def match_mesh_terms(article_mesh_terms, ncd, keyword):
    """Tell whether an article carries an NCD term and the given keyword(s).

    No case folding is done here: terms are compared exactly as given.

    Args:
        article_mesh_terms: Container of the article's MeSH terms. Only
            membership tests (``in``) are performed on it.
        ncd: MeSH term of the disease; it must be present for a match.
        keyword: Either a single MeSH term, or a list/tuple of terms that
            must all be present.

    Returns:
        bool: True when ``ncd`` and every requested keyword are found in
        ``article_mesh_terms``, False otherwise.
    """
    # Guard clause: without the disease term nothing else needs checking.
    if ncd not in article_mesh_terms:
        return False
    if isinstance(keyword, (list, tuple)):
        # all() stops at the first missing term instead of scanning the rest.
        return all(term in article_mesh_terms for term in keyword)
    return keyword in article_mesh_terms
def filter_articles(articles, start_date, end_date, ncd, keyword):
    """Return the articles dated in a window that match an NCD and keyword(s).

    Args:
        articles: Iterable of article dicts. Each one is read through
            ``article["Date"]["Year"/"Month"/"Day"]`` and
            ``article["MeshTerms"]``.
        start_date: Inclusive lower bound of the window (``datetime``).
        end_date: Exclusive upper bound of the window (``datetime``).
        ncd: Disease MeSH term, forwarded to ``match_mesh_terms``.
        keyword: Keyword term or list of terms, forwarded to
            ``match_mesh_terms``.

    Returns:
        list: The matching article dicts, in their original order.
    """
    selected = []
    for entry in articles:
        stamp = entry["Date"]
        published = datetime(int(stamp["Year"]), int(stamp["Month"]), int(stamp["Day"]))
        # Skip anything outside the half-open window [start_date, end_date).
        if not (start_date <= published < end_date):
            continue
        # The article's terms are lower-cased before being matched.
        lowered_terms = [term.lower() for term in entry["MeshTerms"]]
        if match_mesh_terms(lowered_terms, ncd, keyword):
            selected.append(entry)
    return selected
# Load the locally saved articles.
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

print(len(data))

# Lower-cased term lists; each keyword list extends the previous one.
ncds_mesh_terms = [mesh_term.lower() for mesh_term in NCDS_MESH_TERM]
keywords_mesh_terms = [mesh_term.lower() for mesh_term in KEYWORDS_MESH_TERM]
keywords_subheading_mesh_terms = keywords_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_SUBHEADING]
keywords_site_proposition_mesh_terms = keywords_subheading_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_SITE_PROPOSITION]
keywords_proposition_mesh_terms = keywords_site_proposition_mesh_terms + [mesh_term.lower() for mesh_term in KEYWORDS_MESH_PROPOSITION]

# Weekly windows [start, start + 1 week): one per start date strictly before
# 2024-12-31, beginning on 2022-01-01.
first_day = datetime(2022, 1, 1)
one_week = timedelta(weeks=1)
week_starts = []
start_date = first_day
while start_date < datetime(2024, 12, 31):
    week_starts.append(start_date)
    start_date += one_week

# Performance fix: parse every article once and drop its lower-cased MeSH
# terms into the bucket of the week it belongs to. The previous version
# re-scanned the whole dataset and re-parsed every date for each
# (ncd, keyword, week) combination.
weekly_mesh_terms = [[] for _ in week_starts]
for article in data:
    article_date = datetime(int(article["Date"]["Year"]), int(article["Date"]["Month"]), int(article["Date"]["Day"]))
    # Floor division by one week gives the index of the window holding the date;
    # dates before the first window yield a negative index and are ignored.
    week_index = (article_date - first_day) // one_week
    if 0 <= week_index < len(weekly_mesh_terms):
        weekly_mesh_terms[week_index].append({mesh_term.lower() for mesh_term in article["MeshTerms"]})

for ncd in ncds_mesh_terms:
    # One entry per (keyword, week) pair, keyword-major, same order as before.
    count = [
        sum(match_mesh_terms(mesh_terms, ncd, keyword) for mesh_terms in week_bucket)
        for keyword in keywords_mesh_terms
        for week_bucket in weekly_mesh_terms
    ]
    print(f"NCD: {ncd}")
    print(f"Min: {min(count)}")
    print(f"Max: {max(count)}")
    print(f"Mean: {statistics.mean(count)}")
    print(f"Median: {statistics.median(count)}")
......@@ -85,7 +85,7 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
for part in entrie["MedlineCitation"]["Article"]["Journal"]["Title"]:
if "#text" in part:
data["Title"] += part["#text"]
elif not isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], str):
elif "#text" in entrie["MedlineCitation"]["Article"]["Journal"]["Title"] and not isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], str):
data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]["#text"]
else:
data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]
......@@ -95,7 +95,7 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
for part in entrie["MedlineCitation"]["Article"]["ArticleTitle"]:
if "#text" in part:
data["ArticleTitle"] += part["#text"]
elif not isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], str):
elif "#text" in entrie["MedlineCitation"]["Article"]["ArticleTitle"] and not isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], str):
data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"]["#text"]
else:
data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"]
......@@ -106,6 +106,8 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
for part in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]:
if "#text" in part:
data["Abstract"] += part["#text"]
elif "#text" in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"] and not isinstance(entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"], str):
data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]["#text"]
else:
data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]
......
......@@ -7,39 +7,7 @@ import time
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
from folders.dataSources.PubMed.pubmedApi import getPubmedData
# Display labels of the diseases; eight entries, listed in the same order
# as the entries of MESH and MESH_TERMS.
LABELS = [
    "Noncommunicable Diseases",
    "Diabetes",
    "Cancer",
    "Chronic respiratory disease",
    "Cardiovascular diseases",
    "Mental Health",
    "Diabetes type 1",
    "Diabetes type 2",
]
# MeSH headings of the diseases; eight entries, listed in the same order
# as the entries of LABELS and MESH_TERMS.
MESH = [
    "Noncommunicable Diseases",
    "Diabetes Mellitus",
    "Neoplasms",
    "Respiratory Tract Diseases",
    "Cardiovascular Diseases",
    "Mental Health",
    "Diabetes Mellitus, Type 1",
    "Diabetes Mellitus, Type 2",
]
# URL-encoded search terms, one per disease, each carrying the
# [Mesh:noexp] field tag; same order as LABELS and MESH.
MESH_TERMS = [
    '"Noncommunicable+Diseases"[Mesh:noexp]',  # NCDs (all)
    '"Diabetes+Mellitus"[Mesh:noexp]',  # Diabetes (type 1 or 2)
    '"Neoplasms"[Mesh:noexp]',  # Cancer
    '"Respiratory+Tract+Diseases"[Mesh:noexp]',  # Chronic respiratory disease
    '"Cardiovascular+Diseases"[Mesh:noexp]',  # Cardiovascular diseases
    '"Mental+Health"[Mesh:noexp]',  # Mental health
    '"Diabetes+Mellitus%2C+Type+1"[Mesh:noexp]',  # Diabetes type 1
    '"Diabetes+Mellitus%2C+Type+2"[Mesh:noexp]',  # Diabetes type 2
]
from variables.pubmed import NCDS, NCDS_MESH_TERM
date_min = "2024/01/01"
date_max = "2024/12/31"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment