Commit 8297804c authored by ivan.pavlovic

Progress on model testing and on the number of data points per day/week/month

parent 158da3fd
Showing with 5530 additions and 915 deletions
gptKey
geminiKey
.venv
from requests import get
from datetime import datetime, timedelta
import time

# MeSH terms to query, URL-encoded for the E-utilities query string
TERMS = [
    # '"Noncommunicable+Diseases"',    # NCDs (all)
    # '"Diabetes+Mellitus"',           # Diabetes (type 1 or 2)
    # '"Neoplasms"',                   # Cancer
    # '"Respiratory+Tract+Diseases"',  # Chronic respiratory disease
    # '"Cardiovascular+Diseases"',     # Cardiovascular diseases
    '"Mental+Health"',                 # Mental Health
    '"Diabetes+Mellitus%2C+Type+1"',   # Diabetes type 1
    '"Diabetes+Mellitus%2C+Type+2"'    # Diabetes type 2
]

INTERVALS = [
    "day",
    "week",
    "month"
]

def get_count_for_year(year, term, interval="month"):
    """Return the max/min/avg PubMed result count for `term` over interval-sized windows of `year`."""
    current_date = datetime(year, 1, 1)
    counts = []
    while current_date < datetime(year, 12, 31):
        if interval == "day":
            next_date = current_date + timedelta(days=1)
        elif interval == "week":
            next_date = current_date + timedelta(weeks=1)
        elif interval == "month":
            # First day of the following month (day 28 plus 4 days lands in the next month for any month length)
            next_date = (current_date.replace(day=28) + timedelta(days=4)).replace(day=1)
        # Note: mindate and maxdate are both inclusive, so consecutive windows share their boundary day
        url = (
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
            f"?db=pubmed&term={term}&retmode=json"
            f"&mindate={current_date.strftime('%Y/%m/%d')}"
            f"&maxdate={next_date.strftime('%Y/%m/%d')}&usehistory=y"
        )
        response = get(url)
        search_res = response.json()
        counts.append(int(search_res["esearchresult"]["count"]))
        current_date = next_date
        time.sleep(1)  # stay under the E-utilities rate limit
    return {"max": max(counts), "min": min(counts), "avg": sum(counts) / len(counts)}

data = {}
for term in TERMS:
    data[term] = {}
    mesh = term + "[Mesh]"
    print("TERM: ", mesh)
    for interval in INTERVALS:
        print("INTERVAL: ", interval)
        counts = get_count_for_year(2024, mesh, interval)
        print(counts)
        data[term][interval] = counts

print(data)
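A single windowed query can be tried in isolation; below is a minimal sketch against the same endpoint, with an arbitrarily chosen window. One detail worth noting: esearch returns the count as a JSON string, which is why the script above casts it with int().

from requests import get

url = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    '?db=pubmed&term="Mental+Health"[Mesh]&retmode=json'
    "&mindate=2024/01/01&maxdate=2024/01/31"
)
res = get(url).json()
print(res["esearchresult"]["count"])       # the count arrives as a string in the JSON payload
print(int(res["esearchresult"]["count"]))  # cast to int before aggregating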
{
"\"Noncommunicable+Diseases\"": {
"day": {
"max": 8,
"min": 0,
"avg": 2.473972602739726
},
"week": {
"max": 17,
"min": 2,
"avg": 9.471698113207546
},
"month": {
"max": 54,
"min": 20,
"avg": 38.5
}
},
"\"Diabetes+Mellitus\"": {
"day": {
"max": 352,
"min": 0,
"avg": 108.41643835616438
},
"week": {
"max": 583,
"min": 178,
"avg": 430.47169811320754
},
"month": {
"max": 2230,
"min": 1001,
"avg": 1701.75
}
},
"\"Neoplasms\"": {
"day": {
"max": 2135,
"min": 2,
"avg": 689.9424657534247
},
"week": {
"max": 3519,
"min": 1225,
"avg": 2726.6603773584907
},
"month": {
"max": 13160,
"min": 6198,
"avg": 10845.583333333334
}
},
"\"Respiratory+Tract+Diseases\"": {
"day": {
"max": 1109,
"min": 1,
"avg": 356.06575342465754
},
"week": {
"max": 1832,
"min": 624,
"avg": 1411.188679245283
},
"month": {
"max": 6824,
"min": 3360,
"avg": 5595.583333333333
}
},
"\"Cardiovascular+Diseases\"": {
"day": {
"max": 1091,
"min": 0,
"avg": 390.46301369863016
},
"week": {
"max": 2052,
"min": 608,
"avg": 1550.9433962264152
},
"month": {
"max": 7519,
"min": 3345,
"avg": 6134.416666666667
}
},
"\"Mental+Health\"": {
"day": {
"max": 79,
"min": 0,
"avg": 26.953424657534246
},
"week": {
"max": 164,
"min": 49,
"avg": 107.9622641509434
},
"month": {
"max": 532,
"min": 259,
"avg": 424.0
}
},
"\"Diabetes+Mellitus%2C+Type+1\"": {
"day": {
"max": 41,
"min": 0,
"avg": 12.991780821917809
},
"week": {
"max": 89,
"min": 19,
"avg": 51.886792452830186
},
"month": {
"max": 261,
"min": 101,
"avg": 203.5
}
},
"\"Diabetes+Mellitus%2C+Type+2\"": {
"day": {
"max": 168,
"min": 0,
"avg": 50.24109589041096
},
"week": {
"max": 280,
"min": 79,
"avg": 199.26415094339623
},
"month": {
"max": 1059,
"min": 440,
"avg": 789.5833333333334
}
}
}
No preview for this file type
@@ -10,6 +10,14 @@ LABELS = [
     "Diabetes type 2"
 ]
 
+# LABELS = [
+#     "Neoplasms",
+#     "Diabetes Mellitus",
+#     "Male",
+#     "Blood Cells",
+#     "Arthritis, Infectious"
+# ]
+
 MODELS = [
     "facebook/bart-large-mnli",           # https://huggingface.co/facebook/bart-large-mnli
     "MoritzLaurer/bge-m3-zeroshot-v2.0",  # https://huggingface.co/MoritzLaurer/bge-m3-zeroshot-v2.0
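Both checkpoints are zero-shot NLI classifiers; here is a minimal sketch of how they are typically driven, assuming the Hugging Face transformers pipeline API (the abstract text and label subset are invented for illustration):

from transformers import pipeline

# Either of the listed checkpoints can be plugged in here
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

abstract = "We studied glycemic control in adults with type 2 diabetes."  # hypothetical input
result = classifier(
    abstract,
    candidate_labels=["Diabetes type 2", "Cancer", "Mental Health"],
    multi_label=True,  # score each label independently instead of softmaxing across labels
)
print(result["labels"][0], result["scores"][0])  # top-ranked label and its score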
File added
File added
import sys
import os
import json
import time

# Add the parent directory to the module search path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))

from api.pubmedApi import getPubmedData

# Human-readable labels, aligned index-for-index with MESH and MESH_TERMS below
LABELS = [
    "Noncommunicable Diseases",
    "Diabetes",
    "Cancer",
    "Chronic respiratory disease",
    "Cardiovascular diseases",
    "Mental Health",
    "Diabetes type 1",
    "Diabetes type 2"
]

# MeSH headings as they appear in each record's "MeshTerms" field
MESH = [
    "Noncommunicable Diseases",
    "Diabetes Mellitus",
    "Neoplasms",
    "Respiratory Tract Diseases",
    "Cardiovascular Diseases",
    "Mental Health",
    "Diabetes Mellitus, Type 1",
    "Diabetes Mellitus, Type 2"
]

# The same headings, URL-encoded and tagged for PubMed queries
MESH_TERMS = [
    '"Noncommunicable+Diseases"[Mesh]',      # NCDs (all)
    '"Diabetes+Mellitus"[Mesh]',             # Diabetes (type 1 or 2)
    '"Neoplasms"[Mesh]',                     # Cancer
    '"Respiratory+Tract+Diseases"[Mesh]',    # Chronic respiratory disease
    '"Cardiovascular+Diseases"[Mesh]',       # Cardiovascular diseases
    '"Mental+Health"[Mesh]',                 # Mental Health
    '"Diabetes+Mellitus%2C+Type+1"[Mesh]',   # Diabetes type 1
    '"Diabetes+Mellitus%2C+Type+2"[Mesh]'    # Diabetes type 2
]

date_min = "2024/01/01"
date_max = "2024/12/31"

for id_term, mesh_term in enumerate(MESH_TERMS):
    data_list = getPubmedData(mesh_term, date_min, date_max, nb_items=1000)

    # Keep the first 20 records whose MeSH annotations match at least one tracked heading
    data_store = []
    for data in data_list:
        for id_mesh, mesh in enumerate(MESH):
            if mesh in data["MeshTerms"]:
                data["Predictions"].append(LABELS[id_mesh])
        if len(data["Predictions"]) > 0:
            data_store.append(data)
        if len(data_store) >= 20:
            break

    filename = LABELS[id_term].replace(" ", "_").replace(",", "").lower()
    with open(f"./data/{filename}.json", "w") as json_file:
        json.dump(data_store, json_file, indent=4)

    time.sleep(1)  # be polite to the PubMed API between terms
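getPubmedData is the project's own helper (from api/pubmedApi), so only the fields this loop actually touches are certain. A hypothetical record, for orientation:

# Hypothetical record shape, inferred from the fields the loop reads and writes;
# the real helper in api/pubmedApi may return additional fields.
record = {
    "MeshTerms": ["Diabetes Mellitus, Type 2", "Humans"],  # matched against MESH
    "Predictions": []  # starts empty; matching LABELS entries are appended
}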
(12 additional file diffs collapsed)