Commit f4f85471 authored by Ivan Pavlovich

Modified the PubMed API data-retrieval script, and added a script for bulk local storage of PubMed articles.
parent 88696137
File added
[]
\ No newline at end of file
@@ -7,6 +7,8 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
from requests import get
from parsers.xmlParser import parseXmlFile
import json
from variables.pubmed import PUBMED_API_KEY
import xmltodict
TMP_DIR_NAME = "./tmp"
TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), TMP_DIR_NAME))
@@ -28,9 +30,14 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
print(f"Date min: {date_min}") print(f"Date min: {date_min}")
print(f"Date max: {date_max}") print(f"Date max: {date_max}")
url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y' url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&api_key={PUBMED_API_KEY}&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y'
    while(True):
        try:
            response = get(url)
            break
        except Exception as e:
            print(e)

    search_res = response.json()
@@ -42,35 +49,81 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key={query_key}&WebEnv={webenv}" url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key={query_key}&WebEnv={webenv}"
    while(True):
        try:
            response = get(url)
            break
        except Exception as e:
            print(e)

    with open(f"{TMP_DIR}/{TMP_FILENAME}", "w+", encoding="utf-8") as file:
        file.write(response.text)

    # obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}")
    obj = xmltodict.parse(response.text)
    obj = obj["PubmedArticleSet"]

    print()
    data_list = []

    for key in obj:
        if isinstance(obj[key], list):
            i = 0
            for entrie in obj[key]:
                if "MedlineCitation" in entrie:
                    print("---------------------------------------------------------")
                    if "MeshHeadingList" in entrie["MedlineCitation"]:
                        data = {}

                        data["PMID"] = entrie["MedlineCitation"]["PMID"]["#text"]

                        data["Title"] = ""
                        if isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], list):
                            for part in entrie["MedlineCitation"]["Article"]["Journal"]["Title"]:
                                if "#text" in part:
                                    data["Title"] += part["#text"]
                        else:
                            data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]

                        data["ArticleTitle"] = ""
                        if isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], list):
                            for part in entrie["MedlineCitation"]["Article"]["ArticleTitle"]:
                                if "#text" in part:
                                    data["ArticleTitle"] += part["#text"]
                        else:
                            data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"]

                        data["Abstract"] = ""
                        if "Abstract" in entrie["MedlineCitation"]["Article"]:
                            if isinstance(entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"], list):
                                for part in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]:
                                    if "#text" in part:
                                        data["Abstract"] += part["#text"]
                            else:
                                data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]

                        data["Predictions"] = []

                        data["MeshTerms"] = []
                        if isinstance(entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"], list):
                            for meshTerm in entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]:
                                data["MeshTerms"].append(meshTerm["DescriptorName"]["#text"])
                        else:
                            data["MeshTerms"].append(entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]["DescriptorName"]["#text"])

                        for date in entrie["PubmedData"]["History"]["PubMedPubDate"]:
                            if date["@PubStatus"] == "pubmed":
                                data["Date"] = {
                                    "Year": date["Year"],
                                    "Month": date["Month"],
                                    "Day": date["Day"]
                                }
                                break

                        print(data)

                        if debug:
                            print(f"Index: {obj[key].index(entrie)}")
...
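Note on the parsing change above: xmltodict maps a repeated tag to a list, a tag with attributes or inline markup to a dict whose readable text sits under "#text", and a plain tag to a bare string. That is why the diff adds isinstance checks and "#text" lookups for PMID, Title, ArticleTitle and Abstract. A minimal sketch of a helper that could collapse the three shapes (the node_text name is an illustration, not part of this commit):

# Hypothetical helper (not in the commit): normalize the three shapes
# xmltodict can produce for a text node into a single string.
def node_text(node):
    if isinstance(node, list):
        # Repeated tag: join the "#text" of each dict fragment.
        return "".join(part["#text"] for part in node
                       if isinstance(part, dict) and "#text" in part)
    if isinstance(node, dict):
        # Tag with attributes or inline markup: text lives under "#text".
        return node.get("#text", "")
    # Plain tag: already a bare string.
    return node or ""

# e.g. data["Title"] = node_text(entrie["MedlineCitation"]["Article"]["Journal"]["Title"])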
import sys
import os
from datetime import datetime, timedelta
import time
import json
# Add the parent directory to the search path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
from dataSources.PubMed.pubmedApi import getPubmedData
from variables.pubmed import *
from dataSources.PubMed.util import *
DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data"))
ncds_mesh_noexp = get_mesh_noexp_term(NCDS_MESH_TERM)
search_term = url_encode(" OR ".join(ncds_mesh_noexp))
data = []
with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file:
json.dump(data, json_file, indent=4)
current_date = datetime(2022, 1, 1)
while(current_date < datetime(2024, 12, 31)):
next_date = current_date + timedelta(weeks=1)
data += getPubmedData(search_term, current_date.strftime("%Y/%m/%d"), next_date.strftime("%Y/%m/%d"))
current_date = next_date
time.sleep(0.1)
with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file:
json.dump(data, json_file, indent=4)
\ No newline at end of file
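The loader above keeps every article in memory and only writes save_3_years.json after the whole 2022-2024 range is done, so one failed week discards everything fetched so far. A hedged variant (an assumption, not what this commit does) rewrites the snapshot after each weekly window; the 0.1 s sleep is consistent with NCBI's documented E-utilities limit of 10 requests/second with an API key (3/second without one):

current_date = datetime(2022, 1, 1)
while current_date < datetime(2024, 12, 31):
    next_date = current_date + timedelta(weeks=1)
    data += getPubmedData(search_term,
                          current_date.strftime("%Y/%m/%d"),
                          next_date.strftime("%Y/%m/%d"))
    # Checkpoint after every window so a crash keeps the weeks already fetched.
    with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file:
        json.dump(data, json_file, indent=4)
    current_date = next_date
    time.sleep(0.1)  # stay under the 10 req/s cap that applies with an API key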
from transformers import AutoTokenizer
# Choose a tokenizer (e.g., GPT-2, BERT, T5, etc.)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Your text
text = "Hugging Face makes NLP easy!"
# Tokenize and count tokens
tokens = tokenizer(text, return_tensors="pt") # You can also use return_tensors="tf" or "np"
num_tokens = len(tokens["input_ids"][0])
print(f"Number of tokens: {num_tokens}")
\ No newline at end of file
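Since bert-base-uncased accepts at most 512 tokens, the count above is mainly useful for spotting abstracts that would be truncated. A small follow-up check (a possible extension, not in the commit):

# Possible extension (assumption): warn when the text exceeds the model's window.
max_len = tokenizer.model_max_length  # 512 for bert-base-uncased
if num_tokens > max_len:
    print(f"Input would be truncated: {num_tokens} > {max_len} tokens")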
File added
@@ -3,10 +3,12 @@ import xml.etree.ElementTree as ET
def xml_to_obj(xml_element):
    res = {}

    if xml_element.attrib:
        res["@attributes"] = xml_element.attrib

    text = xml_element.text.strip() if xml_element.text and xml_element.text.strip() else None

    for child in xml_element:
        if child.text:
            res[child.tag] = child.text
        else:
            child_dict = xml_to_obj(child)

            if child.tag in res:
@@ -17,10 +19,12 @@ def xml_to_obj(xml_element):
            else:
                res[child.tag] = child_dict

    if text and not res:
        return text

    return res
def parseXmlFile(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    return xml_to_obj(root)
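For reference, a minimal usage sketch of the updated parser; the inline XML is an illustrative assumption, not part of the commit:

import xml.etree.ElementTree as ET

# Illustrative input only: two text-leaf children, no attributes.
xml = "<Article><PMID>12345</PMID><Title>Example</Title></Article>"
print(xml_to_obj(ET.fromstring(xml)))  # {'PMID': '12345', 'Title': 'Example'}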