From 088f25471f97b8653a3c2e498471f630d0d0ea7f Mon Sep 17 00:00:00 2001 From: Ivan Pavlovich <ivan.pavlovic@hes-so.ch> Date: Thu, 13 Mar 2025 16:50:33 +0100 Subject: [PATCH] =?UTF-8?q?Cr=C3=A9ation=20du=20script=20de=20comptage=20d?= =?UTF-8?q?'article=20pubmed=20par=20jour/semaine/mois=20depuis=20sauvegar?= =?UTF-8?q?de=20locale?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dataSources/PubMed/data_num_locale.py | 176 +++++++++++++++++++++----- 1 file changed, 146 insertions(+), 30 deletions(-) diff --git a/dataSources/PubMed/data_num_locale.py b/dataSources/PubMed/data_num_locale.py index c08b13da0..618b4d196 100644 --- a/dataSources/PubMed/data_num_locale.py +++ b/dataSources/PubMed/data_num_locale.py @@ -12,6 +12,17 @@ DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data")) file_path = f"{DATA_DIR}/save_3_years.json" +def get_date_indices(date, start_date): + delta_days = (date - start_date).days + day_index = delta_days + + week_index = (delta_days // 7) + + delta_months = (date.year - start_date.year) * 12 + (date.month - start_date.month) + month_index = delta_months + + return day_index, week_index, month_index + def match_mesh_terms(article_mesh_terms, ncd, keyword): if ncd in article_mesh_terms: if isinstance(keyword, list): @@ -26,47 +37,152 @@ def match_mesh_terms(article_mesh_terms, ncd, keyword): else: return False -def filter_articles(articles, start_date, end_date, ncd, keyword): - filtered_articles = [] - - for article in articles: - article_date = datetime(int(article["Date"]["Year"]), int(article["Date"]["Month"]), int(article["Date"]["Day"])) - - if start_date <= article_date < end_date: - - mesh_terms = [mesh_term.lower() for mesh_term in article["MeshTerms"]] - if match_mesh_terms(mesh_terms, ncd, keyword): - filtered_articles.append(article) - - return filtered_articles - with open(file_path, "r", encoding="utf-8") as file: data = json.load(file) print(len(data)) - 
# Lower-cased MeSH vocabularies. Each keyword list is built by extending the
# previous one, so the four categories below are cumulative.
ncds_mesh_terms = [mesh_term.lower() for mesh_term in NCDS_MESH_TERM.values()]
keywords_mesh_terms = [mesh_term.lower() for mesh_term in KEYWORDS_MESH_TERM.values()]
keywords_subheading_mesh_terms = keywords_mesh_terms + [
    mesh_term.lower() for mesh_term in KEYWORDS_MESH_SUBHEADING
]
keywords_site_proposition_mesh_terms = keywords_subheading_mesh_terms + [
    mesh_term.lower() for mesh_term in KEYWORDS_MESH_SITE_PROPOSITION
]
keywords_proposition_mesh_terms = keywords_site_proposition_mesh_terms + [
    mesh_term.lower() for mesh_term in KEYWORDS_MESH_PROPOSITION
]

# Category label -> keyword vocabulary used to fill that category.
# Insertion order is the key order of every counts[ncd] dictionary.
keyword_sets = {
    "KEYWORDS": keywords_mesh_terms,
    "SUBHEADINGS": keywords_subheading_mesh_terms,
    "SITE PROPOSITION": keywords_site_proposition_mesh_terms,
    "PROPOSITION": keywords_proposition_mesh_terms,
}

# counts[ncd][category][granularity][index] -> list of distinct PMIDs, where
# granularity is "day", "week" or "month" and index is the value returned by
# get_date_indices() for that granularity, relative to start_date.
counts = {}
start_date = datetime(2022, 1, 1)

for article in data:

    mesh_terms = [mesh_term.lower() for mesh_term in article["MeshTerms"]]

    # Created on the first article only; nothing in this section fills it.
    counts.setdefault("ALL", {})

    # Day/week/month indices of this article, computed on the first match
    # only, so articles that never match are not date-parsed at all.
    period_indices = None

    for ncd in ncds_mesh_terms:
        if ncd not in counts:
            counts[ncd] = {
                category: {"day": {}, "week": {}, "month": {}}
                for category in keyword_sets
            }

        for category, keywords in keyword_sets.items():

            # Test each keyword of the vocabulary in turn. The previous code
            # passed the article's own term list as the keyword argument, so
            # the vocabulary was never actually consulted.
            if not any(
                match_mesh_terms(mesh_terms, ncd, keyword) for keyword in keywords
            ):
                continue

            if period_indices is None:
                article_date = datetime(
                    int(article["Date"]["Year"]),
                    int(article["Date"]["Month"]),
                    int(article["Date"]["Day"]),
                )
                period_indices = dict(
                    zip(
                        ("day", "week", "month"),
                        get_date_indices(article_date, start_date),
                    )
                )

            for granularity, index in period_indices.items():
                pmids = counts[ncd][category][granularity].setdefault(index, [])
                # Keep each PMID once per bucket, even if the same article
                # appears several times in the input.
                if article["PMID"] not in pmids:
                    pmids.append(article["PMID"])