From f4f8547147c053b91aea78d9dc20cc7002fe05a2 Mon Sep 17 00:00:00 2001
From: Ivan Pavlovich <ivan.pavlovic@hes-so.ch>
Date: Wed, 12 Mar 2025 17:40:14 +0100
Subject: [PATCH] Modify the PubMed API data retrieval script and add a script
 for bulk local storage of PubMed articles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../__pycache__/pubmedApi.cpython-313.pyc     | Bin 0 -> 6208 bytes
 dataSources/PubMed/data/save_3_years.json     |  1 +
 dataSources/PubMed/pubmedApi.py               | 75 +++++++++++++++---
 dataSources/PubMed/store_data_localy.py       | 37 +++++++++
 models/LLM/Tokenizer/test.py                  | 13 +++
 parsers/__pycache__/xmlParser.cpython-313.pyc | Bin 0 -> 1489 bytes
 parsers/xmlParser.py                          | 32 ++++----
 7 files changed, 133 insertions(+), 25 deletions(-)
 create mode 100644 dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc
 create mode 100644 dataSources/PubMed/data/save_3_years.json
 create mode 100644 dataSources/PubMed/store_data_localy.py
 create mode 100644 models/LLM/Tokenizer/test.py
 create mode 100644 parsers/__pycache__/xmlParser.cpython-313.pyc

diff --git a/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc b/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e650d570dcc84a349d496c81cb5c4b7a99fc2a2
GIT binary patch
literal 6208
[binary bytecode data omitted]
zpg}xMkz35gmBqK1@{2(=T}eif38(?Uo|sfzkXVv&izO>RGw&9AN@fwbn&pTG*Lm^r zpt`oOG_|;-7}8=8EC7v^78QfWKLfzbB2a^)NDE||Kv`l@W@1uKYOx-q`hqky3_x<= zri6=UP&}w9QKSQE|LB4UP+hi?;WMa5Qsf6x$YGP4pHiBWYF8A;z`y`%)f7)=U|{&b z%*e=in}O>#1Lp@e21b^4jz*4l{zm=@Ow-vXvQOuq$iIMTIr~EP<va^{Rs^jNUm3nW zer5cQh|9WeCs;2l_+1w8zrqmkltJPygVt>Z?z;@yw;2Q;@QXF@+~DHrSMOBs*YDJy zp)_B8ruuxHnK}!KmX|LqUtYhkenZ7&b%z5wm!<qKa|JZ8-{6;T;Q1<2%g88M6wAQC F0092Jk}m)N literal 0 HcmV?d00001 diff --git a/dataSources/PubMed/data/save_3_years.json b/dataSources/PubMed/data/save_3_years.json new file mode 100644 index 000000000..0637a088a --- /dev/null +++ b/dataSources/PubMed/data/save_3_years.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/dataSources/PubMed/pubmedApi.py b/dataSources/PubMed/pubmedApi.py index 837125035..8523c34c8 100644 --- a/dataSources/PubMed/pubmedApi.py +++ b/dataSources/PubMed/pubmedApi.py @@ -7,6 +7,8 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))) from requests import get from parsers.xmlParser import parseXmlFile import json +from variables.pubmed import PUBMED_API_KEY +import xmltodict TMP_DIR_NAME = "./tmp" TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), TMP_DIR_NAME)) @@ -28,9 +30,14 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store print(f"Date min: {date_min}") print(f"Date max: {date_max}") - url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y' + url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&api_key={PUBMED_API_KEY}&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y' - response = get(url) + while(True): + try: + response = get(url) + break + except Exception as e: + print(e) search_res = response.json() @@ -42,35 +49,81 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key={query_key}&WebEnv={webenv}" - response = get(url) + while(True): + try: + response = get(url) + break + except Exception as e: + print(e) with open(f"{TMP_DIR}/{TMP_FILENAME}", "w+", encoding="utf-8") as file: file.write(response.text) - obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}") + # obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}") + + obj = xmltodict.parse(response.text) + obj = obj["PubmedArticleSet"] + + print() data_list = [] - for key in obj.keys(): + for key in obj: if isinstance(obj[key], list): i = 0 for entrie in obj[key]: if "MedlineCitation" in entrie: + + print("---------------------------------------------------------") + if "MeshHeadingList" in entrie["MedlineCitation"]: data = {} - data["PMID"] = entrie["MedlineCitation"]["PMID"] - data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"] - data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"] + data["PMID"] = entrie["MedlineCitation"]["PMID"]["#text"] + + data["Title"] = "" + if isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], list): + for part in entrie["MedlineCitation"]["Article"]["Journal"]["Title"]: + if "#text" in part: + data["Title"] += part["#text"] + else: + data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"] + + data["ArticleTitle"] = "" + if isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], list): + for part in entrie["MedlineCitation"]["Article"]["ArticleTitle"]: + if "#text" in part: + data["ArticleTitle"] 
+= part["#text"] + else: + data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"] data["Abstract"] = "" if "Abstract" in entrie["MedlineCitation"]["Article"] : - data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"] + if isinstance(entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"], list): + for part in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]: + if "#text" in part: + data["Abstract"] += part["#text"] + else: + data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"] data["Predictions"] = [] data["MeshTerms"] = [] - for meshTerm in entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]: - data["MeshTerms"].append(meshTerm["DescriptorName"]) + if isinstance(entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"], list): + for meshTerm in entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]: + data["MeshTerms"].append(meshTerm["DescriptorName"]["#text"]) + else: + data["MeshTerms"].append(entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]["DescriptorName"]["#text"]) + + for date in entrie["PubmedData"]["History"]["PubMedPubDate"]: + if date["@PubStatus"] == "pubmed": + data["Date"] = { + "Year": date["Year"], + "Month": date["Month"], + "Day": date["Day"] + } + break + + print(data) if debug: print(f"Index: {obj[key].index(entrie)}") diff --git a/dataSources/PubMed/store_data_localy.py b/dataSources/PubMed/store_data_localy.py new file mode 100644 index 000000000..718ff1b6f --- /dev/null +++ b/dataSources/PubMed/store_data_localy.py @@ -0,0 +1,37 @@ +import sys +import os +from datetime import datetime, timedelta +import time +import json + +# Ajouter le répertoire parent au chemin de recherche +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) + +from dataSources.PubMed.pubmedApi import getPubmedData +from variables.pubmed import * +from dataSources.PubMed.util import * + +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data")) + +ncds_mesh_noexp = get_mesh_noexp_term(NCDS_MESH_TERM) + +search_term = url_encode(" OR ".join(ncds_mesh_noexp)) + +data = [] + +with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file: + json.dump(data, json_file, indent=4) + +current_date = datetime(2022, 1, 1) + +while(current_date < datetime(2024, 12, 31)): + next_date = current_date + timedelta(weeks=1) + + data += getPubmedData(search_term, current_date.strftime("%Y/%m/%d"), next_date.strftime("%Y/%m/%d")) + + current_date = next_date + + time.sleep(0.1) + +with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file: + json.dump(data, json_file, indent=4) \ No newline at end of file diff --git a/models/LLM/Tokenizer/test.py b/models/LLM/Tokenizer/test.py new file mode 100644 index 000000000..523b01ca6 --- /dev/null +++ b/models/LLM/Tokenizer/test.py @@ -0,0 +1,13 @@ +from transformers import AutoTokenizer + +# Choose a tokenizer (e.g., GPT-2, BERT, T5, etc.) +tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") + +# Your text +text = "Hugging Face makes NLP easy!" 
+
+# Tokenize and count tokens
+tokens = tokenizer(text, return_tensors="pt") # You can also use return_tensors="tf" or "np"
+num_tokens = len(tokens["input_ids"][0])
+
+print(f"Number of tokens: {num_tokens}")
\ No newline at end of file
diff --git a/parsers/__pycache__/xmlParser.cpython-313.pyc b/parsers/__pycache__/xmlParser.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7fac1b757ad2110117af27254fa0d7e20f356b6
GIT binary patch
literal 1489
[binary bytecode data omitted]

diff --git a/parsers/xmlParser.py b/parsers/xmlParser.py
index 6c88cd638..5a9ecd7a9 100644
--- a/parsers/xmlParser.py
+++ b/parsers/xmlParser.py
@@ -1,26 +1,30 @@
 import xml.etree.ElementTree as ET
 
 def xml_to_obj(xml_element):
     res = {}
+    if xml_element.attrib:
+        res["@attributes"] = xml_element.attrib
+
+    text = xml_element.text.strip() if xml_element.text and xml_element.text.strip() else None
+
     for child in xml_element:
-        if child.text:
-            res[child.tag] = child.text
-        else:
-            child_dict = xml_to_obj(child)
+        child_dict = xml_to_obj(child)
 
-            if child.tag in res:
-                if isinstance(res[child.tag], list):
-                    res[child.tag].append(child_dict)
-                else:
-                    res[child.tag] = [res[child.tag], child_dict]
-            else:
-                res[child.tag] = child_dict
+        if child.tag in res:
+            if isinstance(res[child.tag], list):
+                res[child.tag].append(child_dict)
+            else:
+                res[child.tag] = [res[child.tag], child_dict]
+        else:
+            res[child.tag] = child_dict
+
+    if text and not res:
+        return text
 
     return res
 
 def parseXmlFile(filename):
     tree = ET.parse(filename)
     root = tree.getroot()
-
-    return xml_to_obj(root)
\ No newline at end of file
+    return xml_to_obj(root)
-- 
GitLab