diff --git a/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc b/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e650d570dcc84a349d496c81cb5c4b7a99fc2a2
Binary files /dev/null and b/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc differ
diff --git a/dataSources/PubMed/data/save_3_years.json b/dataSources/PubMed/data/save_3_years.json
new file mode 100644
index 0000000000000000000000000000000000000000..0637a088a01e8ddab3bf3fa98dbe804cbde1a0dc
--- /dev/null
+++ b/dataSources/PubMed/data/save_3_years.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/dataSources/PubMed/pubmedApi.py b/dataSources/PubMed/pubmedApi.py
index 83712503544b3d5c0464b1e9adb6cedb497f7abf..8523c34c8bfe4b1090a354ad457787bd1f1b603c 100644
--- a/dataSources/PubMed/pubmedApi.py
+++ b/dataSources/PubMed/pubmedApi.py
@@ -7,6 +7,8 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from requests import get
 from parsers.xmlParser import parseXmlFile
 import json
+from variables.pubmed import PUBMED_API_KEY
+import xmltodict
 
 TMP_DIR_NAME = "./tmp"
 TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), TMP_DIR_NAME))
@@ -28,9 +30,14 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
         print(f"Date min: {date_min}")
         print(f"Date max: {date_max}")
 
-    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y'
+    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&api_key={PUBMED_API_KEY}&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y'
 
-    response = get(url)
+    while True:
+        try:
+            response = get(url)
+            break
+        except Exception as e:
+            print(e)
 
     search_res = response.json()
 
@@ -42,35 +49,81 @@
     url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key={query_key}&WebEnv={webenv}"
 
-    response = get(url)
+    while True:
+        try:
+            response = get(url)
+            break
+        except Exception as e:
+            print(e)
 
     with open(f"{TMP_DIR}/{TMP_FILENAME}", "w+", encoding="utf-8") as file:
         file.write(response.text)
 
-    obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}")
+    # obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}")
+
+    obj = xmltodict.parse(response.text)
+    obj = obj["PubmedArticleSet"]
 
     data_list = []
 
-    for key in obj.keys():
+    for key in obj:
         if isinstance(obj[key], list):
             i = 0
             for entrie in obj[key]:
                 if "MedlineCitation" in entrie:
+
+                    if debug: print("---------------------------------------------------------")
+
                     if "MeshHeadingList" in entrie["MedlineCitation"]:
                         data = {}
-                        data["PMID"] = entrie["MedlineCitation"]["PMID"]
-                        data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]
-                        data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"]
+                        data["PMID"] = entrie["MedlineCitation"]["PMID"]["#text"]
+
+                        data["Title"] = ""
+                        if isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], list):
+                            for part in entrie["MedlineCitation"]["Article"]["Journal"]["Title"]:
+                                if "#text" in part:
+                                    data["Title"] += part["#text"]
+                        else:
+                            data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]
+
+                        data["ArticleTitle"] = ""
+                        if isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], list):
+                            for part in entrie["MedlineCitation"]["Article"]["ArticleTitle"]:
+                                if "#text" in part:
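+                                    # xmltodict exposes an element's text under "#text" when the node also carries attributes or inline markup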
data["ArticleTitle"] += part["#text"] + else: + data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"] data["Abstract"] = "" if "Abstract" in entrie["MedlineCitation"]["Article"] : - data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"] + if isinstance(entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"], list): + for part in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]: + if "#text" in part: + data["Abstract"] += part["#text"] + else: + data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"] data["Predictions"] = [] data["MeshTerms"] = [] - for meshTerm in entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]: - data["MeshTerms"].append(meshTerm["DescriptorName"]) + if isinstance(entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"], list): + for meshTerm in entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]: + data["MeshTerms"].append(meshTerm["DescriptorName"]["#text"]) + else: + data["MeshTerms"].append(entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]["DescriptorName"]["#text"]) + + for date in entrie["PubmedData"]["History"]["PubMedPubDate"]: + if date["@PubStatus"] == "pubmed": + data["Date"] = { + "Year": date["Year"], + "Month": date["Month"], + "Day": date["Day"] + } + break + + print(data) if debug: print(f"Index: {obj[key].index(entrie)}") diff --git a/dataSources/PubMed/store_data_localy.py b/dataSources/PubMed/store_data_localy.py new file mode 100644 index 0000000000000000000000000000000000000000..718ff1b6f709d20ec3aaffa44344c87a2e13a9fc --- /dev/null +++ b/dataSources/PubMed/store_data_localy.py @@ -0,0 +1,37 @@ +import sys +import os +from datetime import datetime, timedelta +import time +import json + +# Ajouter le répertoire parent au chemin de recherche +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) + +from dataSources.PubMed.pubmedApi import getPubmedData +from variables.pubmed import * +from dataSources.PubMed.util import * + +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data")) + +ncds_mesh_noexp = get_mesh_noexp_term(NCDS_MESH_TERM) + +search_term = url_encode(" OR ".join(ncds_mesh_noexp)) + +data = [] + +with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file: + json.dump(data, json_file, indent=4) + +current_date = datetime(2022, 1, 1) + +while(current_date < datetime(2024, 12, 31)): + next_date = current_date + timedelta(weeks=1) + + data += getPubmedData(search_term, current_date.strftime("%Y/%m/%d"), next_date.strftime("%Y/%m/%d")) + + current_date = next_date + + time.sleep(0.1) + +with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file: + json.dump(data, json_file, indent=4) \ No newline at end of file diff --git a/models/LLM/Tokenizer/test.py b/models/LLM/Tokenizer/test.py new file mode 100644 index 0000000000000000000000000000000000000000..523b01ca6eb43db3cecd5414d565e297ade8bcfc --- /dev/null +++ b/models/LLM/Tokenizer/test.py @@ -0,0 +1,13 @@ +from transformers import AutoTokenizer + +# Choose a tokenizer (e.g., GPT-2, BERT, T5, etc.) +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + +# Your text +text = "Hugging Face makes NLP easy!" 
+
+# Tokenize and count tokens
+tokens = tokenizer(text, return_tensors="pt")  # You can also use return_tensors="tf" or "np"
+num_tokens = len(tokens["input_ids"][0])
+
+print(f"Number of tokens: {num_tokens}")
\ No newline at end of file
diff --git a/parsers/__pycache__/xmlParser.cpython-313.pyc b/parsers/__pycache__/xmlParser.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7fac1b757ad2110117af27254fa0d7e20f356b6
Binary files /dev/null and b/parsers/__pycache__/xmlParser.cpython-313.pyc differ
diff --git a/parsers/xmlParser.py b/parsers/xmlParser.py
index 6c88cd6386dfe4b6940e13d8b904e124920f6145..5a9ecd7a9a70635978f193c2d4501367135d863d 100644
--- a/parsers/xmlParser.py
+++ b/parsers/xmlParser.py
@@ -1,26 +1,35 @@
 import xml.etree.ElementTree as ET
 
 def xml_to_obj(xml_element):
-    res = {}
+    res = {}
+    if xml_element.attrib:
+        res["@attributes"] = xml_element.attrib
+
+    text = xml_element.text.strip() if xml_element.text and xml_element.text.strip() else None
+
     for child in xml_element:
-        if child.text:
-            res[child.tag] = child.text
-        else:
-            child_dict = xml_to_obj(child)
+        child_dict = xml_to_obj(child)
 
-            if child.tag in res:
-                if isinstance(res[child.tag], list):
-                    res[child.tag].append(child_dict)
-                else:
-                    res[child.tag] = [res[child.tag], child_dict]
-            else:
-                res[child.tag] = child_dict
-
+        if child.tag in res:
+            if isinstance(res[child.tag], list):
+                res[child.tag].append(child_dict)
+            else:
+                res[child.tag] = [res[child.tag], child_dict]
+        else:
+            res[child.tag] = child_dict
+
+    if text and not res:
+        return text
+
     return res
 
 def parseXmlFile(filename):
     tree = ET.parse(filename)
     root = tree.getroot()
-
-    return xml_to_obj(root)
\ No newline at end of file
+    return xml_to_obj(root)
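+
+# Minimal usage sketch (hypothetical file path, for illustration):
+#   obj = parseXmlFile("books.xml")
+#   # repeated tags become lists; unique tags become dicts or strings
+#   print(obj)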