From 623e3f6fc09dad1319445dd2c3a82db3f3155f7c Mon Sep 17 00:00:00 2001 From: Ivan Pavlovich <ivan.pavlovic@hes-so.ch> Date: Mon, 17 Mar 2025 18:38:22 +0100 Subject: [PATCH] =?UTF-8?q?Regeneration=20de=20la=20sauvegarde=20locale=20?= =?UTF-8?q?en=20enlevant=20les=20doublons=20et=20en=20ajoutant=20les=20pot?= =?UTF-8?q?entiels=20articles=20loup=C3=A9s.=20Regeration=20du=20calcule?= =?UTF-8?q?=20d'articles=20post=C3=A9s=20dans=20pubmed=20avec=20le=20champ?= =?UTF-8?q?=20SANS=20KEYWORDS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 - .../__pycache__/pubmedApi.cpython-313.pyc | Bin 6947 -> 6976 bytes .../PubMed/{doc => data}/data_num.json | 0 .../data_num_keyword_no_mesh.json | 0 .../{doc => data}/locale_articles_count.json | 581 +++++++++++------- dataSources/PubMed/data_num_locale.py | 287 +++------ dataSources/PubMed/getPubmedData.py | 15 + dataSources/PubMed/pubmedApi.py | 3 + dataSources/PubMed/store_data_localy.py | 27 +- 9 files changed, 490 insertions(+), 424 deletions(-) rename dataSources/PubMed/{doc => data}/data_num.json (100%) rename dataSources/PubMed/{doc => data}/data_num_keyword_no_mesh.json (100%) rename dataSources/PubMed/{doc => data}/locale_articles_count.json (52%) diff --git a/.gitignore b/.gitignore index 575b7a337..f13d16f8b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ dataSources/PubMed/tmp/* -dataSources/PubMed/data/* .venv \ No newline at end of file diff --git a/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc b/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc index 2b6898c16a135b047d0a3d8a3b5d360a8a9eb13e..30b6928481176e9631551ca208cae0ccac747add 100644 GIT binary patch delta 213 zcmZ2%cEF7HGcPX}0|Ns?pw5l-{*AmYOpJdv2Qk?*2?R@o@>w%7FeEZDFz7R=Fl?U4 zBFn*edGdN5Pv&I|43p(~rC8iq{Bkx+@;+f<tls=jFpZIyU2=i&Wp<St{DK{v4>qR= zZ)Ro8oh%{g8CAu^z^8bHSK$L2gMi4dVh;uehJz~HA%YBt%(z3$7!FIxgi14C))x=e zWxgWF6=J}AMVdQAoB4`1D~N5z6)MVnm4Q1{fcdHbD~K)46{^L4RhKK&dh>3{ct%ba MCPvAkI0gm=07Dx#UjP6A delta 165 zcmX?Lw%CmKGcPX}0|Ns?gX!gTzm2>uOpI?f2Qk?*ZC=PC%fWbh@_rsq=6MVZll6I} zSln6sQZ{SyK4D?Z+sr4F#<;mrcrz<w`eai{PsWnT^^#)FoD7G|xI^?94ok^|3Nv3; z7Y|iszQV~BqQQJcm^(z7`HC_th^@yJ%FBGkf;*Ih`6>r%C@=F>VXjam_N%H~p~jnU SNX9dAIx;az7R50zFaQA6t}7b= diff --git a/dataSources/PubMed/doc/data_num.json b/dataSources/PubMed/data/data_num.json similarity index 100% rename from dataSources/PubMed/doc/data_num.json rename to dataSources/PubMed/data/data_num.json diff --git a/dataSources/PubMed/doc/data_num_keyword_no_mesh.json b/dataSources/PubMed/data/data_num_keyword_no_mesh.json similarity index 100% rename from dataSources/PubMed/doc/data_num_keyword_no_mesh.json rename to dataSources/PubMed/data/data_num_keyword_no_mesh.json diff --git a/dataSources/PubMed/doc/locale_articles_count.json b/dataSources/PubMed/data/locale_articles_count.json similarity index 52% rename from dataSources/PubMed/doc/locale_articles_count.json rename to dataSources/PubMed/data/locale_articles_count.json index cb2b3056f..74066dd3e 100644 --- a/dataSources/PubMed/doc/locale_articles_count.json +++ b/dataSources/PubMed/data/locale_articles_count.json @@ -1,631 +1,784 @@ { + "ALL": { + "NO KEYWORDS": { + "day": { + "min": 0, + "max": 700, + "mean": 142.64 + }, + "week": { + "min": 0, + "max": 1436, + "mean": 996.7080745341615 + }, + "month": { + "min": 95, + "max": 5131, + "mean": 4337.027027027027 + } + }, + "KEYWORDS": { + "day": { + "min": 0, + "max": 39, + "mean": 6.383111111111111 + }, + "week": { + "min": 0, + "max": 69, + "mean": 44.60248447204969 + }, + "month": { + "min": 9, + "max": 263, + "mean": 194.0810810810811 + } + }, + "SUBHEADINGS": { + "day": { + "min": 0, + "max": 39, + "mean": 6.383111111111111 + }, + "week": { + "min": 0, + "max": 69, + "mean": 44.60248447204969 + }, + "month": { + "min": 9, + "max": 263, + "mean": 194.0810810810811 + } + }, + "SITE PROPOSITION": { + "day": { + "min": 0, + "max": 47, + "mean": 8.278222222222222 + }, + "week": { + "min": 0, + "max": 93, + "mean": 57.84472049689441 + }, + "month": { + "min": 10, + "max": 311, + "mean": 251.7027027027027 + } + }, + "PROPOSITION": { + "day": { + "min": 0, + "max": 68, + "mean": 11.38488888888889 + }, + "week": { + "min": 0, + "max": 124, + "mean": 79.5527950310559 + }, + "month": { + "min": 14, + "max": 432, + "mean": 346.1621621621622 + } + } + }, "noncommunicable diseases": { + "NO KEYWORDS": { + "day": { + "min": 0, + "max": 7, + "mean": 1.3555555555555556 + }, + "week": { + "min": 0, + "max": 23, + "mean": 9.472049689440993 + }, + "month": { + "min": 0, + "max": 57, + "mean": 41.21621621621622 + } + }, "KEYWORDS": { "day": { "min": 0, "max": 3, - "mean": 0.20639269406392693 + "mean": 0.2008888888888889 }, "week": { "min": 0, "max": 6, - "mean": 1.4394904458598725 + "mean": 1.4037267080745341 }, "month": { - "min": 2, + "min": 0, "max": 12, - "mean": 6.277777777777778 + "mean": 6.108108108108108 } }, "SUBHEADINGS": { "day": { "min": 0, "max": 3, - "mean": 0.20639269406392693 + "mean": 0.2008888888888889 }, "week": { "min": 0, "max": 6, - "mean": 1.4394904458598725 + "mean": 1.4037267080745341 }, "month": { - "min": 2, + "min": 0, "max": 12, - "mean": 6.277777777777778 + "mean": 6.108108108108108 } }, "SITE PROPOSITION": { "day": { "min": 0, "max": 3, - "mean": 0.2328767123287671 + "mean": 0.22666666666666666 }, "week": { "min": 0, "max": 6, - "mean": 1.624203821656051 + "mean": 1.5838509316770186 }, "month": { - "min": 2, + "min": 0, "max": 14, - "mean": 7.083333333333333 + "mean": 6.891891891891892 } }, "PROPOSITION": { "day": { "min": 0, "max": 4, - "mean": 0.34885844748858447 + "mean": 0.33955555555555555 }, "week": { "min": 0, "max": 8, - "mean": 2.43312101910828 + "mean": 2.372670807453416 }, "month": { - "min": 4, + "min": 0, "max": 17, - "mean": 10.61111111111111 + "mean": 10.324324324324325 } } }, "diabetes mellitus": { + "NO KEYWORDS": { + "day": { + "min": 0, + "max": 58, + "mean": 16.045333333333332 + }, + "week": { + "min": 0, + "max": 183, + "mean": 112.11801242236025 + }, + "month": { + "min": 6, + "max": 662, + "mean": 487.86486486486484 + } + }, "KEYWORDS": { "day": { "min": 0, "max": 5, - "mean": 0.7497725204731575 + "mean": 0.7315555555555555 }, "week": { - "min": 1, + "min": 0, "max": 11, - "mean": 5.248407643312102 + "mean": 5.111801242236025 }, "month": { - "min": 3, + "min": 1, "max": 31, - "mean": 22.27027027027027 + "mean": 22.243243243243242 } }, "SUBHEADINGS": { "day": { "min": 0, "max": 5, - "mean": 0.7497725204731575 + "mean": 0.7315555555555555 }, "week": { - "min": 1, + "min": 0, "max": 11, - "mean": 5.248407643312102 + "mean": 5.111801242236025 }, "month": { - "min": 3, + "min": 1, "max": 31, - "mean": 22.27027027027027 + "mean": 22.243243243243242 } }, "SITE PROPOSITION": { "day": { "min": 0, "max": 8, - "mean": 1.0454959053685169 + "mean": 1.0222222222222221 }, "week": { - "min": 2, + "min": 0, "max": 15, - "mean": 7.318471337579618 + "mean": 7.142857142857143 }, "month": { - "min": 3, + "min": 1, "max": 44, - "mean": 31.054054054054053 + "mean": 31.08108108108108 } }, "PROPOSITION": { "day": { "min": 0, - "max": 10, - "mean": 1.4249317561419472 + "max": 11, + "mean": 1.392 }, "week": { - "min": 3, - "max": 20, - "mean": 9.97452229299363 + "min": 0, + "max": 19, + "mean": 9.726708074534162 }, "month": { - "min": 5, + "min": 2, "max": 57, "mean": 42.32432432432432 } } }, "neoplasms": { + "NO KEYWORDS": { + "day": { + "min": 0, + "max": 311, + "mean": 58.556444444444445 + }, + "week": { + "min": 0, + "max": 598, + "mean": 409.167701863354 + }, + "month": { + "min": 35, + "max": 2108, + "mean": 1780.4324324324325 + } + }, "KEYWORDS": { "day": { "min": 0, "max": 16, - "mean": 2.692447679708826 + "mean": 2.6266666666666665 }, "week": { - "min": 2, + "min": 0, "max": 33, - "mean": 18.727848101265824 + "mean": 18.354037267080745 }, "month": { - "min": 14, + "min": 4, "max": 101, - "mean": 79.97297297297297 + "mean": 79.86486486486487 } }, "SUBHEADINGS": { "day": { "min": 0, "max": 16, - "mean": 2.692447679708826 + "mean": 2.6266666666666665 }, "week": { - "min": 2, + "min": 0, "max": 33, - "mean": 18.727848101265824 + "mean": 18.354037267080745 }, "month": { - "min": 14, + "min": 4, "max": 101, - "mean": 79.97297297297297 + "mean": 79.86486486486487 } }, "SITE PROPOSITION": { "day": { "min": 0, "max": 16, - "mean": 2.735213830755232 + "mean": 2.6684444444444444 }, "week": { - "min": 2, + "min": 0, "max": 33, - "mean": 19.025316455696203 + "mean": 18.645962732919255 }, "month": { - "min": 14, + "min": 4, "max": 101, - "mean": 81.24324324324324 + "mean": 81.13513513513513 } }, "PROPOSITION": { "day": { "min": 0, "max": 21, - "mean": 3.5281818181818183 + "mean": 3.447111111111111 }, "week": { - "min": 2, + "min": 0, "max": 43, - "mean": 24.563291139240505 + "mean": 24.08695652173913 }, "month": { - "min": 17, + "min": 4, "max": 127, - "mean": 104.89189189189189 + "mean": 104.8108108108108 } } }, "respiratory tract diseases": { + "NO KEYWORDS": { + "day": { + "min": 0, + "max": 7, + "mean": 0.7795555555555556 + }, + "week": { + "min": 0, + "max": 13, + "mean": 5.447204968944099 + }, + "month": { + "min": 3, + "max": 41, + "mean": 23.7027027027027 + } + }, "KEYWORDS": { "day": { "min": 0, "max": 1, - "mean": 0.02281021897810219 + "mean": 0.021333333333333333 }, "week": { "min": 0, "max": 2, - "mean": 0.15822784810126583 + "mean": 0.14906832298136646 }, "month": { "min": 0, "max": 3, - "mean": 0.6756756756756757 + "mean": 0.6486486486486487 } }, "SUBHEADINGS": { "day": { "min": 0, "max": 1, - "mean": 0.02281021897810219 + "mean": 0.021333333333333333 }, "week": { "min": 0, "max": 2, - "mean": 0.15822784810126583 + "mean": 0.14906832298136646 }, "month": { "min": 0, "max": 3, - "mean": 0.6756756756756757 + "mean": 0.6486486486486487 } }, "SITE PROPOSITION": { "day": { "min": 0, "max": 1, - "mean": 0.02281021897810219 + "mean": 0.021333333333333333 }, "week": { "min": 0, "max": 2, - "mean": 0.15822784810126583 + "mean": 0.14906832298136646 }, "month": { "min": 0, "max": 3, - "mean": 0.6756756756756757 + "mean": 0.6486486486486487 } }, "PROPOSITION": { "day": { "min": 0, "max": 1, - "mean": 0.0364963503649635 + "mean": 0.034666666666666665 }, "week": { "min": 0, "max": 2, - "mean": 0.25316455696202533 + "mean": 0.2422360248447205 }, "month": { "min": 0, "max": 3, - "mean": 1.0810810810810811 + "mean": 1.054054054054054 } } }, "cardiovascular diseases": { + "NO KEYWORDS": { + "day": { + "min": 0, + "max": 119, + "mean": 21.612444444444446 + }, + "week": { + "min": 0, + "max": 260, + "mean": 151.01863354037266 + }, + "month": { + "min": 20, + "max": 790, + "mean": 657.1351351351351 + } + }, "KEYWORDS": { "day": { "min": 0, "max": 5, - "mean": 0.6584699453551912 + "mean": 0.64 }, "week": { "min": 0, "max": 13, - "mean": 4.575949367088608 + "mean": 4.472049689440993 }, "month": { - "min": 3, + "min": 0, "max": 40, - "mean": 19.54054054054054 + "mean": 19.45945945945946 } }, "SUBHEADINGS": { "day": { "min": 0, "max": 5, - "mean": 0.6584699453551912 + "mean": 0.64 }, "week": { "min": 0, "max": 13, - "mean": 4.575949367088608 + "mean": 4.472049689440993 }, "month": { - "min": 3, + "min": 0, "max": 40, - "mean": 19.54054054054054 + "mean": 19.45945945945946 } }, "SITE PROPOSITION": { "day": { "min": 0, "max": 5, - "mean": 0.6930783242258652 + "mean": 0.6737777777777778 }, "week": { "min": 0, "max": 13, - "mean": 4.8164556962025316 + "mean": 4.708074534161491 }, "month": { - "min": 3, + "min": 0, "max": 42, - "mean": 20.56756756756757 + "mean": 20.486486486486488 } }, "PROPOSITION": { "day": { "min": 0, "max": 8, - "mean": 1.1474067333939946 + "mean": 1.1173333333333333 }, "week": { - "min": 1, + "min": 0, "max": 24, - "mean": 7.981012658227848 + "mean": 7.807453416149069 }, "month": { - "min": 5, + "min": 1, "max": 58, - "mean": 34.08108108108108 + "mean": 33.972972972972975 } } }, "mental health": { + "NO KEYWORDS": { + "day": { + "min": 0, + "max": 74, + "mean": 16.589333333333332 + }, + "week": { + "min": 0, + "max": 213, + "mean": 115.9192546583851 + }, + "month": { + "min": 8, + "max": 679, + "mean": 504.4054054054054 + } + }, "KEYWORDS": { "day": { "min": 0, "max": 8, - "mean": 1.1856232939035487 + "mean": 1.1564444444444444 }, "week": { - "min": 1, + "min": 0, "max": 21, - "mean": 8.246835443037975 + "mean": 8.080745341614907 }, "month": { - "min": 4, + "min": 1, "max": 52, - "mean": 35.21621621621622 + "mean": 35.16216216216216 } }, "SUBHEADINGS": { "day": { "min": 0, "max": 8, - "mean": 1.1856232939035487 + "mean": 1.1564444444444444 }, "week": { - "min": 1, + "min": 0, "max": 21, - "mean": 8.246835443037975 + "mean": 8.080745341614907 }, "month": { - "min": 4, + "min": 1, "max": 52, - "mean": 35.21621621621622 + "mean": 35.16216216216216 } }, "SITE PROPOSITION": { "day": { "min": 0, "max": 8, - "mean": 1.2438580527752503 + "mean": 1.2133333333333334 }, "week": { - "min": 1, + "min": 0, "max": 22, - "mean": 8.651898734177216 + "mean": 8.478260869565217 }, "month": { - "min": 4, + "min": 1, "max": 55, - "mean": 36.945945945945944 + "mean": 36.891891891891895 } }, "PROPOSITION": { "day": { "min": 0, "max": 11, - "mean": 2.1618181818181816 + "mean": 2.1093333333333333 }, "week": { - "min": 1, + "min": 0, "max": 32, - "mean": 15.050632911392405 + "mean": 14.73913043478261 }, "month": { - "min": 8, - "max": 84, - "mean": 64.27027027027027 + "min": 2, + "max": 85, + "mean": 64.13513513513513 } } }, "diabetes mellitus, type 1": { + "NO KEYWORDS": { + "day": { + "min": 0, + "max": 40, + "mean": 7.019555555555556 + }, + "week": { + "min": 0, + "max": 78, + "mean": 49.04968944099379 + }, + "month": { + "min": 2, + "max": 257, + "mean": 213.43243243243242 + } + }, "KEYWORDS": { "day": { "min": 0, "max": 3, - "mean": 0.3072014585232452 + "mean": 0.29688888888888887 }, "week": { "min": 0, "max": 8, - "mean": 2.132911392405063 + "mean": 2.0745341614906834 }, "month": { - "min": 3, + "min": 1, "max": 19, - "mean": 9.108108108108109 + "mean": 9.027027027027026 } }, "SUBHEADINGS": { "day": { "min": 0, "max": 3, - "mean": 0.3072014585232452 + "mean": 0.29688888888888887 }, "week": { "min": 0, "max": 8, - "mean": 2.132911392405063 + "mean": 2.0745341614906834 }, "month": { - "min": 3, + "min": 1, "max": 19, - "mean": 9.108108108108109 + "mean": 9.027027027027026 } }, "SITE PROPOSITION": { "day": { "min": 0, "max": 13, - "mean": 1.5141037306642402 + "mean": 1.4755555555555555 }, "week": { - "min": 3, + "min": 0, "max": 30, - "mean": 10.531645569620252 + "mean": 10.31055900621118 }, "month": { - "min": 5, + "min": 1, "max": 66, - "mean": 44.972972972972975 + "mean": 44.86486486486486 } }, "PROPOSITION": { "day": { "min": 0, "max": 13, - "mean": 1.5941765241128298 + "mean": 1.5537777777777777 }, "week": { - "min": 3, + "min": 0, "max": 30, - "mean": 11.08860759493671 + "mean": 10.857142857142858 }, "month": { - "min": 5, + "min": 1, "max": 68, - "mean": 47.351351351351354 + "mean": 47.24324324324324 } } }, "diabetes mellitus, type 2": { - "KEYWORDS": { + "NO KEYWORDS": { "day": { "min": 0, - "max": 7, - "mean": 0.9763421292083713 + "max": 146, + "mean": 28.133333333333333 }, "week": { "min": 0, - "max": 14, - "mean": 6.791139240506329 + "max": 282, + "mean": 196.583850931677 }, "month": { - "min": 6, - "max": 45, - "mean": 29 + "min": 26, + "max": 1021, + "mean": 855.4054054054054 } }, - "SUBHEADINGS": { + "KEYWORDS": { "day": { "min": 0, "max": 7, - "mean": 0.9763421292083713 + "mean": 0.952 }, "week": { "min": 0, "max": 14, - "mean": 6.791139240506329 + "mean": 6.6521739130434785 }, "month": { - "min": 6, + "min": 3, "max": 45, - "mean": 29 + "mean": 28.945945945945947 } }, - "SITE PROPOSITION": { + "SUBHEADINGS": { "day": { "min": 0, - "max": 10, - "mean": 1.520909090909091 + "max": 7, + "mean": 0.952 }, "week": { - "min": 1, - "max": 23, - "mean": 10.58860759493671 - }, - "month": { - "min": 8, - "max": 61, - "mean": 45.21621621621622 - } - }, - "PROPOSITION": { - "day": { "min": 0, "max": 14, - "mean": 2.099090909090909 - }, - "week": { - "min": 2, - "max": 28, - "mean": 14.613924050632912 - }, - "month": { - "min": 10, - "max": 84, - "mean": 62.4054054054054 - } - } - }, - "ALL": { - "KEYWORDS": { - "day": { - "min": 0, - "max": 39, - "mean": 6.54 - }, - "week": { - "min": 8, - "max": 69, - "mean": 45.53164556962025 - }, - "month": { - "min": 31, - "max": 262, - "mean": 194.43243243243242 - } - }, - "SUBHEADINGS": { - "day": { - "min": 0, - "max": 39, - "mean": 6.54 - }, - "week": { - "min": 8, - "max": 69, - "mean": 45.53164556962025 + "mean": 6.6521739130434785 }, "month": { - "min": 31, - "max": 262, - "mean": 194.43243243243242 + "min": 3, + "max": 45, + "mean": 28.945945945945947 } }, "SITE PROPOSITION": { "day": { "min": 0, - "max": 47, - "mean": 8.478181818181818 + "max": 10, + "mean": 1.4844444444444445 }, "week": { - "min": 8, - "max": 93, - "mean": 59.0253164556962 + "min": 0, + "max": 23, + "mean": 10.372670807453416 }, "month": { - "min": 34, - "max": 310, - "mean": 252.05405405405406 + "min": 4, + "max": 62, + "mean": 45.13513513513514 } }, "PROPOSITION": { "day": { "min": 0, - "max": 67, - "mean": 11.658181818181818 + "max": 14, + "mean": 2.049777777777778 }, "week": { - "min": 9, - "max": 124, - "mean": 81.16455696202532 + "min": 0, + "max": 28, + "mean": 14.322981366459627 }, "month": { - "min": 46, - "max": 431, - "mean": 346.5945945945946 + "min": 5, + "max": 84, + "mean": 62.32432432432432 } } } diff --git a/dataSources/PubMed/data_num_locale.py b/dataSources/PubMed/data_num_locale.py index 35aad2e52..48c0c2f8c 100644 --- a/dataSources/PubMed/data_num_locale.py +++ b/dataSources/PubMed/data_num_locale.py @@ -8,23 +8,6 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../" from variables.pubmed import NCDS_MESH_TERM, KEYWORDS_MESH_TERM, KEYWORDS_MESH_SUBHEADING, KEYWORDS_MESH_SITE_PROPOSITION, KEYWORDS_MESH_PROPOSITION -CATEGORIES = [ - "KEYWORDS", - "SUBHEADINGS", - "SITE PROPOSITION", - "PROPOSITION" -] - -INTERVALS = [ - "day", - "week", - "month" -] - -DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data")) - -file_path = f"{DATA_DIR}/save_3_years.json" - def lower_keywords(mesh_terms): res = [] @@ -47,33 +30,6 @@ def get_date_indices(date, start_date): return day_index, week_index, month_index -def match_mesh_terms(article_mesh_terms, ncd, keyword): - if ncd in article_mesh_terms: - if isinstance(keyword, list): - all_in = True - for k in keyword: - if k not in article_mesh_terms: - all_in = False - - return all_in - else: - return keyword in article_mesh_terms - else: - return False - -def init_index(category, counts, ncd, article_date): - start_date = datetime(2022, 1, 1) - day_index, week_index, month_index = get_date_indices(article_date, start_date) - - if day_index not in counts[ncd][category]["day"]: - counts[ncd][category]["day"][day_index] = [] - - if week_index not in counts[ncd][category]["week"]: - counts[ncd][category]["week"][week_index] = [] - - if month_index not in counts[ncd][category]["month"]: - counts[ncd][category]["month"][month_index] = [] - def add_article(article, category, counts, ncd, article_date): start_date = datetime(2022, 1, 1) day_index, week_index, month_index = get_date_indices(article_date, start_date) @@ -87,6 +43,35 @@ def add_article(article, category, counts, ncd, article_date): if article["PMID"] not in counts[ncd][category]["month"][month_index]: counts[ncd][category]["month"][month_index].append(article["PMID"]) +def mesh_term_present(article_mesh_terms, mesh_term): + if isinstance(mesh_term, list): + all_in = True + for part in mesh_term: + if part not in article_mesh_terms: + all_in = False + + return all_in + else: + return mesh_term in article_mesh_terms + +CATEGORIES = [ + "NO KEYWORDS", + "KEYWORDS", + "SUBHEADINGS", + "SITE PROPOSITION", + "PROPOSITION" +] + +INTERVALS = [ + "day", + "week", + "month" +] + +DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data")) +TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./tmp")) + +file_path = f"{TMP_DIR}/save_3_years.json" with open(file_path, "r", encoding="utf-8") as file: data = json.load(file) @@ -101,69 +86,23 @@ keywords_proposition_mesh_terms = lower_keywords(KEYWORDS_MESH_PROPOSITION) counts = {} -for ncd in ncds_mesh_terms: - counts[ncd] = { - "KEYWORDS" : { - "day": {}, - "week": {}, - "month": {} - }, - "SUBHEADINGS" : { - "day": {}, - "week": {}, - "month": {} - }, - "SITE PROPOSITION" : { - "day": {}, - "week": {}, - "month": {} - }, - "PROPOSITION" : { - "day": {}, - "week": {}, - "month": {} - }, - } - - start_date = datetime(2022, 1, 1) - end_date = datetime(2024, 12, 31) - current_date = start_date +counts["ALL"] = {} - while(current_date < end_date): - day_index, week_index, month_index = get_date_indices(current_date, start_date) +for category in CATEGORIES: + counts["ALL"][category] = {} + for interval in INTERVALS: + counts["ALL"][category][interval] = {} - for category in CATEGORIES: - counts[ncd][category]["day"][day_index] = [] - counts[ncd][category]["week"][week_index] = [] - counts[ncd][category]["month"][month_index] = [] +for ncd in ncds_mesh_terms: + counts[ncd] = {} + for category in CATEGORIES: + counts[ncd][category] = {} + for interval in INTERVALS: + counts[ncd][category][interval] = {} - current_date += timedelta(days=1) - -counts["ALL"] = { - "KEYWORDS" : { - "day": {}, - "week": {}, - "month": {} - }, - "SUBHEADINGS" : { - "day": {}, - "week": {}, - "month": {} - }, - "SITE PROPOSITION" : { - "day": {}, - "week": {}, - "month": {} - }, - "PROPOSITION" : { - "day": {}, - "week": {}, - "month": {} - }, -} start_date = datetime(2022, 1, 1) -end_date = datetime(2024, 12, 31) +end_date = datetime(2025, 1, 30) current_date = start_date while(current_date < end_date): @@ -174,134 +113,80 @@ while(current_date < end_date): counts["ALL"][category]["week"][week_index] = [] counts["ALL"][category]["month"][month_index] = [] + for ncd in ncds_mesh_terms: + for category in CATEGORIES: + counts[ncd][category]["day"][day_index] = [] + counts[ncd][category]["week"][week_index] = [] + counts[ncd][category]["month"][month_index] = [] + current_date += timedelta(days=1) for article in data: - mesh_terms = [mesh_term.lower() for mesh_term in article["MeshTerms"]] + article_mesh_terms = [mesh_term.lower() for mesh_term in article["MeshTerms"]] article_date = datetime(int(article["Date"]["Year"]), int(article["Date"]["Month"]), int(article["Date"]["Day"])) - if "ALL" not in counts: - counts["ALL"] = { - "KEYWORDS" : { - "day": {}, - "week": {}, - "month": {} - }, - "SUBHEADINGS" : { - "day": {}, - "week": {}, - "month": {} - }, - "SITE PROPOSITION" : { - "day": {}, - "week": {}, - "month": {} - }, - "PROPOSITION" : { - "day": {}, - "week": {}, - "month": {} - }, - } - for ncd in ncds_mesh_terms: - if ncd not in counts: - counts[ncd] = { - "KEYWORDS" : { - "day": {}, - "week": {}, - "month": {} - }, - "SUBHEADINGS" : { - "day": {}, - "week": {}, - "month": {} - }, - "SITE PROPOSITION" : { - "day": {}, - "week": {}, - "month": {} - }, - "PROPOSITION" : { - "day": {}, - "week": {}, - "month": {} - }, - } - - for keyword in keywords_mesh_terms: - - if match_mesh_terms(mesh_terms, ncd, keyword): - init_index("KEYWORDS", counts, ncd, article_date) - init_index("SUBHEADINGS", counts, ncd, article_date) - init_index("SITE PROPOSITION", counts, ncd, article_date) - init_index("PROPOSITION", counts, ncd, article_date) - add_article(article, "KEYWORDS", counts, ncd, article_date) - add_article(article, "SUBHEADINGS", counts, ncd, article_date) - add_article(article, "SITE PROPOSITION", counts, ncd, article_date) - add_article(article, "PROPOSITION", counts, ncd, article_date) + if mesh_term_present(article_mesh_terms, ncd): - init_index("KEYWORDS", counts, "ALL", article_date) - init_index("SUBHEADINGS", counts, "ALL", article_date) - init_index("SITE PROPOSITION", counts, "ALL", article_date) - init_index("PROPOSITION", counts, "ALL", article_date) + add_article(article, CATEGORIES[0], counts, ncd, article_date) + add_article(article, CATEGORIES[0], counts, "ALL", article_date) - add_article(article, "KEYWORDS", counts, "ALL", article_date) - add_article(article, "SUBHEADINGS", counts, "ALL", article_date) - add_article(article, "SITE PROPOSITION", counts, "ALL", article_date) - add_article(article, "PROPOSITION", counts, "ALL", article_date) + added = False - for keyword in keywords_subheading_mesh_terms: + for keyword in keywords_mesh_terms: + if added: + break - if match_mesh_terms(mesh_terms, ncd, keyword): - init_index("SUBHEADINGS", counts, ncd, article_date) - init_index("SITE PROPOSITION", counts, ncd, article_date) - init_index("PROPOSITION", counts, ncd, article_date) + if mesh_term_present(article_mesh_terms, keyword): - add_article(article, "SUBHEADINGS", counts, ncd, article_date) - add_article(article, "SITE PROPOSITION", counts, ncd, article_date) - add_article(article, "PROPOSITION", counts, ncd, article_date) + for category in CATEGORIES[1:]: + add_article(article, category, counts, ncd, article_date) + add_article(article, category, counts, "ALL", article_date) - init_index("SUBHEADINGS", counts, "ALL", article_date) - init_index("SITE PROPOSITION", counts, "ALL", article_date) - init_index("PROPOSITION", counts, "ALL", article_date) + added = True - add_article(article, "SUBHEADINGS", counts, "ALL", article_date) - add_article(article, "SITE PROPOSITION", counts, "ALL", article_date) - add_article(article, "PROPOSITION", counts, "ALL", article_date) + for keyword in keywords_subheading_mesh_terms: + if added: + break - for keyword in keywords_site_proposition_mesh_terms: + if mesh_term_present(article_mesh_terms, keyword): - if match_mesh_terms(mesh_terms, ncd, keyword): - init_index("SITE PROPOSITION", counts, ncd, article_date) - init_index("PROPOSITION", counts, ncd, article_date) + for category in CATEGORIES[2:]: + add_article(article, category, counts, ncd, article_date) + add_article(article, category, counts, "ALL", article_date) + + added = True - add_article(article, "SITE PROPOSITION", counts, ncd, article_date) - add_article(article, "PROPOSITION", counts, ncd, article_date) + for keyword in keywords_site_proposition_mesh_terms: + if added: + break - init_index("SITE PROPOSITION", counts, "ALL", article_date) - init_index("PROPOSITION", counts, "ALL", article_date) + if mesh_term_present(article_mesh_terms, keyword): - add_article(article, "SITE PROPOSITION", counts, "ALL", article_date) - add_article(article, "PROPOSITION", counts, "ALL", article_date) + for category in CATEGORIES[3:]: + add_article(article, category, counts, ncd, article_date) + add_article(article, category, counts, "ALL", article_date) - for keyword in keywords_proposition_mesh_terms: + added = True - if match_mesh_terms(mesh_terms, ncd, keyword): - init_index("PROPOSITION", counts, ncd, article_date) + for keyword in keywords_proposition_mesh_terms: + if added: + break - add_article(article, "PROPOSITION", counts, ncd, article_date) + if mesh_term_present(article_mesh_terms, keyword): - init_index("PROPOSITION", counts, "ALL", article_date) + for category in CATEGORIES[4:]: + add_article(article, category, counts, ncd, article_date) + add_article(article, category, counts, "ALL", article_date) - add_article(article, "PROPOSITION", counts, "ALL", article_date) + added = True for ncd in ncds_mesh_terms: for category in CATEGORIES: for interval in INTERVALS: - counts[ncd][category][interval] = [len(tmp) for key, tmp in counts[ncd][category][interval].items()] + counts[ncd][category][interval] = [len(tmp) for _, tmp in counts[ncd][category][interval].items()] counts[ncd][category][interval] = { "min": min(counts[ncd][category][interval]), diff --git a/dataSources/PubMed/getPubmedData.py b/dataSources/PubMed/getPubmedData.py index e69de29bb..0ebb541c0 100644 --- a/dataSources/PubMed/getPubmedData.py +++ b/dataSources/PubMed/getPubmedData.py @@ -0,0 +1,15 @@ +import sys +import os +import json + +# Ajouter le répertoire parent au chemin de recherche +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))) + +TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./tmp")) + +data = [] + +with open(f"{TMP_DIR}/save_3_years.json", "r", encoding="utf-8") as file: + data = json.load(file) + +print(len(data)) \ No newline at end of file diff --git a/dataSources/PubMed/pubmedApi.py b/dataSources/PubMed/pubmedApi.py index 28c194590..c0b969794 100644 --- a/dataSources/PubMed/pubmedApi.py +++ b/dataSources/PubMed/pubmedApi.py @@ -62,6 +62,9 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store # obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}") obj = xmltodict.parse(response.text) + if "PubmedArticleSet" not in obj: + return [] + obj = obj["PubmedArticleSet"] print() diff --git a/dataSources/PubMed/store_data_localy.py b/dataSources/PubMed/store_data_localy.py index 718ff1b6f..64c3d2cff 100644 --- a/dataSources/PubMed/store_data_localy.py +++ b/dataSources/PubMed/store_data_localy.py @@ -11,27 +11,38 @@ from dataSources.PubMed.pubmedApi import getPubmedData from variables.pubmed import * from dataSources.PubMed.util import * -DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data")) +TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./tmp")) ncds_mesh_noexp = get_mesh_noexp_term(NCDS_MESH_TERM) search_term = url_encode(" OR ".join(ncds_mesh_noexp)) -data = [] +data_to_store = [] -with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file: - json.dump(data, json_file, indent=4) +with open(f"{TMP_DIR}/save_3_years.json", "w") as json_file: + json.dump(data_to_store, json_file, indent=4) current_date = datetime(2022, 1, 1) +stored_pmid = [] + while(current_date < datetime(2024, 12, 31)): - next_date = current_date + timedelta(weeks=1) + next_date = current_date + timedelta(days=4) + + data = getPubmedData(search_term, current_date.strftime("%Y/%m/%d"), next_date.strftime("%Y/%m/%d")) + + if len(data) > 10000: + print("ERROR: MORE THAN 10000 ARTICLES") + exit(1) - data += getPubmedData(search_term, current_date.strftime("%Y/%m/%d"), next_date.strftime("%Y/%m/%d")) + for article in data: + if article["PMID"] not in stored_pmid: + data_to_store.append(article) + stored_pmid.append(article["PMID"]) current_date = next_date time.sleep(0.1) -with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file: - json.dump(data, json_file, indent=4) \ No newline at end of file +with open(f"{TMP_DIR}/save_3_years.json", "w") as json_file: + json.dump(data_to_store, json_file, indent=4) \ No newline at end of file -- GitLab