From f4f8547147c053b91aea78d9dc20cc7002fe05a2 Mon Sep 17 00:00:00 2001
From: Ivan Pavlovich <ivan.pavlovic@hes-so.ch>
Date: Wed, 12 Mar 2025 17:40:14 +0100
Subject: [PATCH] =?UTF-8?q?Modification=20du=20script=20de=20r=C3=A9cup?=
 =?UTF-8?q?=C3=A9ration=20des=20donn=C3=A9es=20d'API=20PubMed=20et=20scrip?=
 =?UTF-8?q?t=20de=20stokage=20des=20articles=20de=20pubmed=20en=20masse=20?=
 =?UTF-8?q?en=20locale?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../__pycache__/pubmedApi.cpython-313.pyc     | Bin 0 -> 6208 bytes
 dataSources/PubMed/data/save_3_years.json     |   1 +
 dataSources/PubMed/pubmedApi.py               |  75 +++++++++++++++---
 dataSources/PubMed/store_data_localy.py       |  37 +++++++++
 models/LLM/Tokenizer/test.py                  |  13 +++
 parsers/__pycache__/xmlParser.cpython-313.pyc | Bin 0 -> 1489 bytes
 parsers/xmlParser.py                          |  32 ++++----
 7 files changed, 133 insertions(+), 25 deletions(-)
 create mode 100644 dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc
 create mode 100644 dataSources/PubMed/data/save_3_years.json
 create mode 100644 dataSources/PubMed/store_data_localy.py
 create mode 100644 models/LLM/Tokenizer/test.py
 create mode 100644 parsers/__pycache__/xmlParser.cpython-313.pyc

diff --git a/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc b/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e650d570dcc84a349d496c81cb5c4b7a99fc2a2
GIT binary patch
literal 6208
zcmey&%ge>Uz`*cr-Np33;tULrK^z$7gfc!KVPs&K$`H&D%;?Qf1fdy=7-JZMSfDD9
zs33ME4wM<gh*c%be5fiAg+nDbNDdX_P|1s`3dD<HH)RCT3=F|crc4kPOaw+NfPBU5
z&Edsa#0h4zcyoDi7jc8xjNUv&Ji!dXtlqp{d_{avG5#X{V74Lw9PXgI-(m#N{Z=Fx
z%w8lE%o)rf#gN5VB%IEu$@LQCeZMMZJw1I*##_wksU@0>w|EK?i;7bta&z1=b5b=K
zZ}A0$I{CV~#5)Ff#(TR)R<Y{om*f^y@fDON<))^%B$g!VRpjO*BS$pIyUYv>44=z5
zK}jW)RRd%Z0|P^%ECWL*TQH+l1Or2g7>FGNjSQ#)Fcr)MiwKy$6h;PybmnM~nGh8$
zp$vQs#&C5C3`laUp^P}?*g~05<$~Fx;mMYdA(#VXZZIdvgkUa^+F)*D&OCNG204ak
zxDP|&Nhps)jv<sI2up%A1-p}hA&)DRE0~v+fq@~I4@Bm1!}<JRJ`a-11wy$|U56wm
z7|Mex7c2x)9V`qMXTk5rXn015=7TznfsY|rBw7H$GiDAJ4HgR)w`68w$m0#=4HAaQ
zA?Q$!AQ1!)#)8EbI4dv&Gr*Fb0z<SIOihpkjD?_;Kw*c+yga^8zF-M(DDfde0v56g
z4EhXt{Goip4DL)Gj~N*91fr#3W{2{H3V`%Q%fLinZi1&MCI$vyIR=Jc$vi=rYOqN=
z85r_}f~7)*tQi;>Qb4MB7?4w!G?H&+LWP2*K=#1ZDlsq^Fyslt#Dir)vcYm7GE_KN
zRvnZM7(<1F<v}bY_d?AHRsg9CRs@ssAQHC^673im;PznX6K3#`W?;w@36={LfxC{I
zfdOQ<Et4%1$m~!Nh|6?&85oM0i<v-f@|9&^;AT+3V=~ADkjh|?Ka@Z$oc>S(*@DLh
z78pJN>%il7B{JL&HZxQ>80=QCtCc~n4pv4DJy0kmCNMDI^o=seH^C|(5|5wsF#Lq1
z50B53iSe1gAgY^#k<1Pi4hH*69~8%`ARmHRY9JOCKgodn5v&d(@%Tm;$u~$kgc&?!
z85r_JLxqFYKqdsq!}1w|4iyd70Lekp7$VpB%0p6^zcOCkupEJ8D|Yijg@QG~_Gp1g
zZ4fEU;Gw|4kS7)@7OVr}2kU}s98hlNV_*ms3)TaPhKdF2LumsL9c-w_94Z`a1PU<)
z27e|{K0uT^p~471gtEZOk;GDz(!m&3mINsxLITDL6@r<7T9PB|2sQzg5+JwWRACCr
ze|S}xfzmo&73QF{f>(tFEN)QU5z3D>rjbL2Iatza3Ijt5GXsM=lkX}9hETy!W>DxS
zvM?|xfW&Ts#DXnBrB0rpJCpAoNP2K*@_hzjA+;BD7#I+BS14Ps6|5X*V#pJZ1~*ng
zp&O(GB2lptLnwc!c(8S_jU_W?&6Lg)%CEwp$Pmo|)5Xq^D9XT)$BBrIP)=BUDKN-0
z1T*AuI&%7RK<G5aGzME{d4@DbcNSj(2I`o^?E4sGk`4nbb%EMFX-pv76+mIf?6(qP
zry>I`HR(JM6`Hnww-~Be^|Ffd^W3V~LQ;!ztrV)bToOxC6>>B45bVSXD}^fGjFOUq
zVk>?9)Y6j7oMOGa<fKf!yqsLUyvz)}^!zgY)Vz|S)GDa7erj=QVo`F2URrW`rhQ71
zEu<-^mROJ(pPgE1Tg9eUl3J8&TP3Delv<LTpOR_|a)w%NW?o8SNvdrXr&?}e1%%0~
zR$82zky%`lUsP#Z8FGskYIad-acNG;Esny{)S}Al)XH0I<*7-jd1Y1Hfgllu?9@st
zg(}YQ)Fg$}yfQ0=DnAmvkd|74=>xEB5I=CJg{LOD=9SsrV$`o<D%Y-JEiFmYwa{d|
z#gUqqoS%}Jmwrni0Ne(4EGo%N&PfeUEhz$ZC~oolrl#a%=A}AkmL!&B=I2#e>XHjU
zW))|6q$Yyw^T{kOxy2IT>*-R(s$7y<QF4nlB(o$Z^%gtS1-IC}@=J^I5_4|xKt&+(
z9F9rFB}IwJCAWBBtPqfP?tr4yl+0w18;fso`lc3VfYM3vEp8AKYUwRLm(=3qqRfJl
z{35@^+|*lK5Kn^|{<qjYATf7~H=s1hH#H@oGzpZ#s<<2iN|S<15=%;pZ?QpA$1Rq~
z)Wo7&tiJhqB^kGvT@oubnTtTxMv(#o149w0xV*&z*39mimy!xfm24odS}9bqgMDG8
zP{oJrD^L=K`Nm423hZXE^Q{zKf;ttq7|UORn)aHEx7afCQd09uG=)HMmr`7wS(0&!
zwV)_7ucQc63KlUlFfiQWbgf8EEdYhZEf!F6y~UDWkeYXkJ;XO4-o-QM77vKw=IP_=
z=jiKti?zHcvn2Hv3pg4$D{^y6@<Abaixu2^y2X`QoS9c#l9-pAdW$6o6mM*a1qG>j
zDYsZbZmGD%l2V#mpeX?|peXki2PiYd=Vs=?Scw(4IP#L>GfPr)i*K=}q$ZW7-(m%2
zlGIxqX_+~xd7zkJE-lKr#Zi=6T#%nvoO+8f^%fT-W5ySy7T@B8C*~qhIeCi(r0*7U
zep1#g&XmNG#CWi?m_d1qG4mE1D2HdJ-eLhs-C`+7EGoIhkqb=`VA0fDoS<Nd2N_cg
zYRxO)0JmcFGxBp&^)t&7^YmTvlS^|`^Gb^K^O93^L5Axlrl%LBrh{{aZem`FZgNgy
zadBo^W-?evAEYKYzqBYhwOAjLs`Mc#$gv<(ub}c4Z+dD8A_*4xGcYhnGB7X{pJruX
zXkfU<&ND${M$%-x6&%+!EH7$U?ohw3;dD{M=?1^Z6Bf=6w+SW_)H?L9ut?ou;q365
zP%$Cv0+-?y79}+91D?n$Q5hE^GcRQ3UnnTKkYD<NfuW4ihv@?YgD+Dki1J|y15sg2
z@gORJDFZ~sGi8FPOr|mr#a+(S;QWA#?*kJHr|2DVsTszTD>^)G@Cbh3U=Wg;E;~_n
zvV4Qr0|BWS_7?@TS9o0%FmLd@!7Dz4|01u(3W<xnMh$KtS9{OkoKSs%OZf_m3do(F
z6VfI`T;P(s!Xp2GC+b3U%7v)Z3#qvm@=7k`mR{g0!*1sn4hAm1DdH12X9&)*nW1-q
zPjiLI4)sf%P6twMaPahVb#hIxm~K1K_6mpMcV-4IXT~pF4BR69`d#`nH0JBh)V<85
z(cpZ8i}wPb!bL8{2InU%oGtDT1cVwqpYRKHFyG-8osc|3d1A&Amh0TAm$_AM2ncpK
z-jI@=kvvyp0^1E?l@)APh4r3@NM4W%y&@9!KwM*m+XZ8ni#o1X#a(a6E6!lKp{{#D
z&;Fvi!vV32>K-S=FRBO5V3`p%ljnhk)(n;z;WK&fNUJVLyP#`#QO*7U=S4NI6P6d%
zLNA2HUQ~;_l8|y;I`yJ->J1IO3;GThH5?CUUexe85qVJ~{6a+h6^(=iEH{+3R-`Po
zzoDYBLVU6BhL9cV8)Gl3I4$-4$jYFZ$oPSS!O;AIxyMBV&l8##4MHx2#$7aszmSlA
z(I8_1%YyKQybtsZXRyqOyC|W9;S#%xYEf5W60S=pUX)Hmb3@z}jTjW$<u0pO>~J|C
zzSH-jisw}o3y}S>Ap0%sF4*~BG!M8C7<JJ+`a(?BMf2<hEDPc;DwsUb)}6sJBkH1r
z#tmuJ6=D|*94~4*9k9Hp>3dn)_l~^g3bhM{P8YSD4|rbG@;gy=Q7h^~bmm2^tSdPM
z*X0W@$`{_yF@l8mfy|3K0T%+JFY3fxh|RvDle56(hPvJcm8JDJG;~%(UDklPO~do5
zhQ%jV2Hjkc+f1x3Sb1MG1_h~c_=Sk1i^j<pQi?7b7cXF05Oq<(;DMgO43-(O7bUbG
z=;+O0nGt<aLK7*NE;s~UR13NgoN!Ss@k(;~b?J<Y(iw0!U2qG(s1tD^GVO{^`T~v{
z3MvbtFDe*bSFpIKV6j8uf>Y2%%is$k(HAXat|`QRVrJ0IVEn+wAgj1Q`l5{Pbs3Y3
zGA0|sF4*{9H1)gSpK#GM@j_DmMbm;03=Bn#PD~#e7|fhOInss68_e-x`oh2<>&x^&
zMQer4MHP!3k{4B+5BOhE37O$?Ltc4-&qaB|4G~x6Z6D|uY!F-Nb3@l~gZj$Y8`}CC
zSXWwqWM?r@TEMa(Y$4A_E(S%lkNga(p^RUI7$oIpq|D)dCarKsMsb1r3g->XD?Qfx
zt@PVrc3H>ig0}MoC6_BQt{_Rb6($?lR@$u4yP#xwMaBvw<i5gXgZN6{74{dDtgpz}
zfQ0-unC#%YXzYH$$m4>N=M@<*kf7I!k_}!L_3ba{Ib2Y3ydvWS67=4{vLX7Sf#U^z
zrwdBXS7clsNGM*H(6}g}u|njEgx&*b<pqiJY;MRaEby3Jds{*Efr8=!!-Xu18NYZk
zF-R+aWM>c%{uac<z`=WmS8#&H4CgDn3JW~%@CfvKc6m<7Tp+o@@Dh*dcV-4Y#Sd%@
z0wNFiB`$DC{P+^j%pf57nSqg0^bU{U1jT-jPLJOom>D@me|_R);O77FMGLR|cRo-_
zbB)vK*Y_MI20q0ryb56Be-(Q$Ffbfc;SS_uIAq2hXv}a}N+w8x`GSskkQVbrKCVDr
z=8F>Cff~#gHCRDxW3C_}=8Lx6LA=bDcv*vlm@i3i1*x-N(&7rTVAo`M`Tzg_|88zY
zpg}xMkz35gmBqK1@{2(=T}eif38(?Uo|sfzkXVv&izO>RGw&9AN@fwbn&pTG*Lm^r
zpt`oOG_|;-7}8=8EC7v^78QfWKLfzbB2a^)NDE||Kv`l@W@1uKYOx-q`hqky3_x<=
zri6=UP&}w9QKSQE|LB4UP+hi?;WMa5Qsf6x$YGP4pHiBWYF8A;z`y`%)f7)=U|{&b
z%*e=in}O>#1Lp@e21b^4jz*4l{zm=@Ow-vXvQOuq$iIMTIr~EP<va^{Rs^jNUm3nW
zer5cQh|9WeCs;2l_+1w8zrqmkltJPygVt>Z?z;@yw;2Q;@QXF@+~DHrSMOBs*YDJy
zp)_B8ruuxHnK}!KmX|LqUtYhkenZ7&b%z5wm!<qKa|JZ8-{6;T;Q1<2%g88M6wAQC
F0092Jk}m)N

literal 0
HcmV?d00001

diff --git a/dataSources/PubMed/data/save_3_years.json b/dataSources/PubMed/data/save_3_years.json
new file mode 100644
index 000000000..0637a088a
--- /dev/null
+++ b/dataSources/PubMed/data/save_3_years.json
@@ -0,0 +1 @@
+[]
\ No newline at end of file
diff --git a/dataSources/PubMed/pubmedApi.py b/dataSources/PubMed/pubmedApi.py
index 837125035..8523c34c8 100644
--- a/dataSources/PubMed/pubmedApi.py
+++ b/dataSources/PubMed/pubmedApi.py
@@ -7,6 +7,8 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from requests import get
 from parsers.xmlParser import parseXmlFile
 import json
+from variables.pubmed import PUBMED_API_KEY
+import xmltodict
 
 TMP_DIR_NAME = "./tmp"
 TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), TMP_DIR_NAME))
@@ -28,9 +30,14 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
     print(f"Date min: {date_min}")
     print(f"Date max: {date_max}")
 
-    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y'
+    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&api_key={PUBMED_API_KEY}&term={term}&retmode=json&mindate={date_min}&maxdate={date_max}&usehistory=y'
 
-    response = get(url)
+    while(True):
+        try:  
+            response = get(url)
+            break
+        except Exception as e:
+            print(e)
 
     search_res = response.json()
 
@@ -42,35 +49,81 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
 
     url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key={query_key}&WebEnv={webenv}"
 
-    response = get(url)
+    while(True):
+        try:
+            response = get(url)
+            break
+        except Exception as e:
+            print(e)
 
     with open(f"{TMP_DIR}/{TMP_FILENAME}", "w+", encoding="utf-8") as file:
         file.write(response.text)
 
-    obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}")
+    # obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}")
+
+    obj = xmltodict.parse(response.text)
+    obj = obj["PubmedArticleSet"]
+
+    print()
 
     data_list = []
 
-    for key in obj.keys():
+    for key in obj:
         if isinstance(obj[key], list):
             i = 0
             for entrie in obj[key]:
                 if "MedlineCitation" in entrie:
+
+                    print("---------------------------------------------------------")
+
                     if "MeshHeadingList" in entrie["MedlineCitation"]:
                         data = {}
-                        data["PMID"] = entrie["MedlineCitation"]["PMID"]
-                        data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]
-                        data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"]
+                        data["PMID"] = entrie["MedlineCitation"]["PMID"]["#text"]
+
+                        data["Title"] = ""
+                        if isinstance(entrie["MedlineCitation"]["Article"]["Journal"]["Title"], list):
+                            for part in entrie["MedlineCitation"]["Article"]["Journal"]["Title"]:
+                                if "#text" in part:
+                                    data["Title"] += part["#text"]
+                        else:  # pairs with the isinstance check; as a for-else it overwrote the joined text and left scalar titles empty
+                            data["Title"] = entrie["MedlineCitation"]["Article"]["Journal"]["Title"]
+                        # ArticleTitle uses the same list-or-scalar handling as the journal title above.
+                        data["ArticleTitle"] = ""
+                        if isinstance(entrie["MedlineCitation"]["Article"]["ArticleTitle"], list):
+                            for part in entrie["MedlineCitation"]["Article"]["ArticleTitle"]:
+                                if "#text" in part:
+                                    data["ArticleTitle"] += part["#text"]
+                        else:  # pairs with the isinstance check, not the for loop
+                            data["ArticleTitle"] = entrie["MedlineCitation"]["Article"]["ArticleTitle"]
                         
                         data["Abstract"] = ""
                         if "Abstract" in entrie["MedlineCitation"]["Article"] :
-                            data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]
+                            if isinstance(entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"], list):
+                                for part in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]:
+                                    if "#text" in part:
+                                        data["Abstract"] += part["#text"]
+                            else:
+                                data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]
                         
                         data["Predictions"] = []
 
                         data["MeshTerms"] = []
-                        for meshTerm in entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]:
-                            data["MeshTerms"].append(meshTerm["DescriptorName"])
+                        if isinstance(entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"], list):
+                            for meshTerm in entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]:
+                                data["MeshTerms"].append(meshTerm["DescriptorName"]["#text"])
+                        else:
+                            data["MeshTerms"].append(entrie["MedlineCitation"]["MeshHeadingList"]["MeshHeading"]["DescriptorName"]["#text"])
+
+                        for date in entrie["PubmedData"]["History"]["PubMedPubDate"]:
+                            if date["@PubStatus"] == "pubmed":
+                                data["Date"] = {
+                                    "Year": date["Year"],
+                                    "Month": date["Month"],
+                                    "Day": date["Day"]
+                                }
+                                break
+
+                        print(data)
 
                         if debug:
                             print(f"Index: {obj[key].index(entrie)}")
diff --git a/dataSources/PubMed/store_data_localy.py b/dataSources/PubMed/store_data_localy.py
new file mode 100644
index 000000000..718ff1b6f
--- /dev/null
+++ b/dataSources/PubMed/store_data_localy.py
@@ -0,0 +1,37 @@
+import sys
+import os
+from datetime import datetime, timedelta
+import time
+import json
+
+# Add the repository root (two directories up) to the module search path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+
+from dataSources.PubMed.pubmedApi import getPubmedData
+from variables.pubmed import *
+from dataSources.PubMed.util import *
+
+DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data"))
+# Join the terms from get_mesh_noexp_term with OR, then pass the query through url_encode.
+ncds_mesh_noexp = get_mesh_noexp_term(NCDS_MESH_TERM)
+
+search_term = url_encode(" OR ".join(ncds_mesh_noexp))
+
+data = []
+# Start from an empty result file. NOTE(review): presumably a writability check before the long download - confirm.
+with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file:
+    json.dump(data, json_file, indent=4)
+
+current_date = datetime(2022, 1, 1)
+end_date = datetime(2024, 12, 31)
+while current_date <= end_date:
+    # mindate/maxdate are inclusive: use disjoint 7-day windows and clamp the last one to end_date.
+    window_end = min(current_date + timedelta(days=6), end_date)
+    data += getPubmedData(search_term, current_date.strftime("%Y/%m/%d"), window_end.strftime("%Y/%m/%d"))
+
+    current_date = window_end + timedelta(days=1)
+    # Brief pause before requesting the next window.
+    time.sleep(0.1)
+
+with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file:
+    json.dump(data, json_file, indent=4)
\ No newline at end of file
diff --git a/models/LLM/Tokenizer/test.py b/models/LLM/Tokenizer/test.py
new file mode 100644
index 000000000..523b01ca6
--- /dev/null
+++ b/models/LLM/Tokenizer/test.py
@@ -0,0 +1,13 @@
+from transformers import AutoTokenizer
+
+# Tokenizer under test (other checkpoints, e.g. GPT-2 or T5, can be loaded the same way).
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+# Sample sentence whose tokens are counted.
+text = "Hugging Face makes NLP easy!"
+
+# Encode as PyTorch tensors ("tf" and "np" are the other formats) and measure the first row of ids.
+tokens = tokenizer(text, return_tensors="pt")
+first_row = tokens["input_ids"][0]
+num_tokens = len(first_row)
+print(f"Number of tokens: {num_tokens}")
\ No newline at end of file
diff --git a/parsers/__pycache__/xmlParser.cpython-313.pyc b/parsers/__pycache__/xmlParser.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7fac1b757ad2110117af27254fa0d7e20f356b6
GIT binary patch
literal 1489
zcmey&%ge>Uz`)Qv`(pYpCI*JbAPx*OK^dQw7#J9)G6XXOGkP<4F%~l^FnBQ)F)J_x
zGf6QNu>><qF%+?;GitKD1ZnU~W&}}C%*w#Pz|6qF@L7ooWQGAl9%Cp&5L6PG(tvUj
z`573ns|aNbW>jYY%feg+)dQm7`Ymy(2m-kUq!bC8LfLTLQsn6NXAEY@V+v&uX7K1@
zV8~;RW``Qf5Xx){4UIgOQ05>Gm^gwCWwd5sU`UK(V2I|1iAD%xQwef@G!IOTDKre=
zE|+Iuz^yNcAEpmMhqA!jroiAEhfjAXi#`)}vqACUTf@K*;fURLVEG;f25trghEOKB
zTR9mRLK$Ja6eb3SP$m_IP(~F7O(wr8ZimE@lA_F{(vs9-O^#b^5Z*18lGKWlTdc(;
zMVSS+xGHjU;!E=5^OLe}F_$E!-{Q(F&de(=Nz6-5y~UD~SzL09EwP{=H7`Yz^%gfs
zacWL#Zfai1E#{)s;v!H`-eOJ8$jnK(#RaC~Q!<lFib3KEAaE;CKO;XkRX?*VF;Cwm
zKe-g7w^%<fIYl=mu_RG9F+IH~H9fH;Ge1u^F)u|oIVZ8WI5RCX87!n<kXTflT2!oG
zk((0$X6hAG7J(84m;mL6;xh~k3=Ir7xOf_zAFvBuXP3OrE_p*%<GQT=MOpn0<__PB
z98wR|jjpSkUspHZ;eFl4=c0|zb(^4zHbK{ILN3~bbXZ^KmcGa>eVtqJBDdldZsiM6
zaSbky1!X!|?}*ATV7)A=*1`3FTcF>v({cv$Wp3#^{E{=2=V#8$oT;})<vPFqMSlGo
z0>aZhCwflgyDp%5Q9yM;)-?gc2TH2TGZ$tq)Z3tSLD6;sGf2T?z6Y}E6IiEnPUP(N
z`zXvHB>PQ_frDq7^9_FC4(<osd<|ZoK?w^b>w~-r!k<&X#YZ$q0T>5ClP;J8C6pLK
z8DPmVk1>=HUuMl?LdsxFFkLDPnv8y$EVr0kLvFExBj6T$dTL2getro!)HRuJainGD
zq~;~&rru&HDN0Sf#R8Hn2Dwo|L7_;1fq~%`4_IeJZjM`KPHK@P0|SFJ0|P^`8aTG@
zaP#(CbXiRBxXdlv;BteTzu%_QrrWN;<qo&#gp|wN(id2yiv$@M7&O^#iB{z1=%s?J
z(Q}0)y%3O<tYD#A+(=?Y!VC-yU?+m@Tgd=60^%GFo80`A(wtPgA~^;I22fE_EXu&Z
c@PV0;k?}r*(gg;kPb^}L{GSb&8D+rQ0GKBlU;qFB

literal 0
HcmV?d00001

diff --git a/parsers/xmlParser.py b/parsers/xmlParser.py
index 6c88cd638..5a9ecd7a9 100644
--- a/parsers/xmlParser.py
+++ b/parsers/xmlParser.py
@@ -1,26 +1,30 @@
 import xml.etree.ElementTree as ET
 
 def xml_to_obj(xml_element):
-    res = {}   
+    res = {}
     
+    if xml_element.attrib:
+        res["@attributes"] = xml_element.attrib
+
+    text = xml_element.text.strip() if xml_element.text and xml_element.text.strip() else None
+
     for child in xml_element:
-        if child.text:
-            res[child.tag] = child.text
-        else:
-            child_dict = xml_to_obj(child)
+        child_dict = xml_to_obj(child)
 
-            if child.tag in res:
-                if isinstance(res[child.tag], list):
-                    res[child.tag].append(child_dict)
-                else:
-                    res[child.tag] = [res[child.tag], child_dict]
+        if child.tag in res:
+            if isinstance(res[child.tag], list):
+                res[child.tag].append(child_dict)
             else:
-                res[child.tag] = child_dict
-    
+                res[child.tag] = [res[child.tag], child_dict]
+        else:
+            res[child.tag] = child_dict
+
+    if text and not res:
+        return text
+    if text: res["#text"] = text  # keep the text of elements that also carry attributes or children
     return res
 
 def parseXmlFile(filename):
     tree = ET.parse(filename)
     root = tree.getroot()
-
-    return xml_to_obj(root)
\ No newline at end of file
+    return xml_to_obj(root)
-- 
GitLab