Skip to content
Snippets Groups Projects
Commit 623e3f6f authored by Ivan Pavlovich's avatar Ivan Pavlovich
Browse files

Régénération de la sauvegarde locale en enlevant les doublons et en ajoutant...

Régénération de la sauvegarde locale en enlevant les doublons et en ajoutant les potentiels articles loupés. Régénération du calcul d'articles postés dans PubMed avec le champ SANS KEYWORDS
parent 02f90f52
No related branches found
No related tags found
No related merge requests found
dataSources/PubMed/tmp/* dataSources/PubMed/tmp/*
dataSources/PubMed/data/*
.venv .venv
\ No newline at end of file
No preview for this file type
{ {
"ALL": {
"NO KEYWORDS": {
"day": {
"min": 0,
"max": 700,
"mean": 142.64
},
"week": {
"min": 0,
"max": 1436,
"mean": 996.7080745341615
},
"month": {
"min": 95,
"max": 5131,
"mean": 4337.027027027027
}
},
"KEYWORDS": {
"day": {
"min": 0,
"max": 39,
"mean": 6.383111111111111
},
"week": {
"min": 0,
"max": 69,
"mean": 44.60248447204969
},
"month": {
"min": 9,
"max": 263,
"mean": 194.0810810810811
}
},
"SUBHEADINGS": {
"day": {
"min": 0,
"max": 39,
"mean": 6.383111111111111
},
"week": {
"min": 0,
"max": 69,
"mean": 44.60248447204969
},
"month": {
"min": 9,
"max": 263,
"mean": 194.0810810810811
}
},
"SITE PROPOSITION": {
"day": {
"min": 0,
"max": 47,
"mean": 8.278222222222222
},
"week": {
"min": 0,
"max": 93,
"mean": 57.84472049689441
},
"month": {
"min": 10,
"max": 311,
"mean": 251.7027027027027
}
},
"PROPOSITION": {
"day": {
"min": 0,
"max": 68,
"mean": 11.38488888888889
},
"week": {
"min": 0,
"max": 124,
"mean": 79.5527950310559
},
"month": {
"min": 14,
"max": 432,
"mean": 346.1621621621622
}
}
},
"noncommunicable diseases": { "noncommunicable diseases": {
"NO KEYWORDS": {
"day": {
"min": 0,
"max": 7,
"mean": 1.3555555555555556
},
"week": {
"min": 0,
"max": 23,
"mean": 9.472049689440993
},
"month": {
"min": 0,
"max": 57,
"mean": 41.21621621621622
}
},
"KEYWORDS": { "KEYWORDS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 3, "max": 3,
"mean": 0.20639269406392693 "mean": 0.2008888888888889
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 6, "max": 6,
"mean": 1.4394904458598725 "mean": 1.4037267080745341
}, },
"month": { "month": {
"min": 2, "min": 0,
"max": 12, "max": 12,
"mean": 6.277777777777778 "mean": 6.108108108108108
} }
}, },
"SUBHEADINGS": { "SUBHEADINGS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 3, "max": 3,
"mean": 0.20639269406392693 "mean": 0.2008888888888889
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 6, "max": 6,
"mean": 1.4394904458598725 "mean": 1.4037267080745341
}, },
"month": { "month": {
"min": 2, "min": 0,
"max": 12, "max": 12,
"mean": 6.277777777777778 "mean": 6.108108108108108
} }
}, },
"SITE PROPOSITION": { "SITE PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 3, "max": 3,
"mean": 0.2328767123287671 "mean": 0.22666666666666666
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 6, "max": 6,
"mean": 1.624203821656051 "mean": 1.5838509316770186
}, },
"month": { "month": {
"min": 2, "min": 0,
"max": 14, "max": 14,
"mean": 7.083333333333333 "mean": 6.891891891891892
} }
}, },
"PROPOSITION": { "PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 4, "max": 4,
"mean": 0.34885844748858447 "mean": 0.33955555555555555
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 8, "max": 8,
"mean": 2.43312101910828 "mean": 2.372670807453416
}, },
"month": { "month": {
"min": 4, "min": 0,
"max": 17, "max": 17,
"mean": 10.61111111111111 "mean": 10.324324324324325
} }
} }
}, },
"diabetes mellitus": { "diabetes mellitus": {
"NO KEYWORDS": {
"day": {
"min": 0,
"max": 58,
"mean": 16.045333333333332
},
"week": {
"min": 0,
"max": 183,
"mean": 112.11801242236025
},
"month": {
"min": 6,
"max": 662,
"mean": 487.86486486486484
}
},
"KEYWORDS": { "KEYWORDS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 5, "max": 5,
"mean": 0.7497725204731575 "mean": 0.7315555555555555
}, },
"week": { "week": {
"min": 1, "min": 0,
"max": 11, "max": 11,
"mean": 5.248407643312102 "mean": 5.111801242236025
}, },
"month": { "month": {
"min": 3, "min": 1,
"max": 31, "max": 31,
"mean": 22.27027027027027 "mean": 22.243243243243242
} }
}, },
"SUBHEADINGS": { "SUBHEADINGS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 5, "max": 5,
"mean": 0.7497725204731575 "mean": 0.7315555555555555
}, },
"week": { "week": {
"min": 1, "min": 0,
"max": 11, "max": 11,
"mean": 5.248407643312102 "mean": 5.111801242236025
}, },
"month": { "month": {
"min": 3, "min": 1,
"max": 31, "max": 31,
"mean": 22.27027027027027 "mean": 22.243243243243242
} }
}, },
"SITE PROPOSITION": { "SITE PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 8, "max": 8,
"mean": 1.0454959053685169 "mean": 1.0222222222222221
}, },
"week": { "week": {
"min": 2, "min": 0,
"max": 15, "max": 15,
"mean": 7.318471337579618 "mean": 7.142857142857143
}, },
"month": { "month": {
"min": 3, "min": 1,
"max": 44, "max": 44,
"mean": 31.054054054054053 "mean": 31.08108108108108
} }
}, },
"PROPOSITION": { "PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 10, "max": 11,
"mean": 1.4249317561419472 "mean": 1.392
}, },
"week": { "week": {
"min": 3, "min": 0,
"max": 20, "max": 19,
"mean": 9.97452229299363 "mean": 9.726708074534162
}, },
"month": { "month": {
"min": 5, "min": 2,
"max": 57, "max": 57,
"mean": 42.32432432432432 "mean": 42.32432432432432
} }
} }
}, },
"neoplasms": { "neoplasms": {
"NO KEYWORDS": {
"day": {
"min": 0,
"max": 311,
"mean": 58.556444444444445
},
"week": {
"min": 0,
"max": 598,
"mean": 409.167701863354
},
"month": {
"min": 35,
"max": 2108,
"mean": 1780.4324324324325
}
},
"KEYWORDS": { "KEYWORDS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 16, "max": 16,
"mean": 2.692447679708826 "mean": 2.6266666666666665
}, },
"week": { "week": {
"min": 2, "min": 0,
"max": 33, "max": 33,
"mean": 18.727848101265824 "mean": 18.354037267080745
}, },
"month": { "month": {
"min": 14, "min": 4,
"max": 101, "max": 101,
"mean": 79.97297297297297 "mean": 79.86486486486487
} }
}, },
"SUBHEADINGS": { "SUBHEADINGS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 16, "max": 16,
"mean": 2.692447679708826 "mean": 2.6266666666666665
}, },
"week": { "week": {
"min": 2, "min": 0,
"max": 33, "max": 33,
"mean": 18.727848101265824 "mean": 18.354037267080745
}, },
"month": { "month": {
"min": 14, "min": 4,
"max": 101, "max": 101,
"mean": 79.97297297297297 "mean": 79.86486486486487
} }
}, },
"SITE PROPOSITION": { "SITE PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 16, "max": 16,
"mean": 2.735213830755232 "mean": 2.6684444444444444
}, },
"week": { "week": {
"min": 2, "min": 0,
"max": 33, "max": 33,
"mean": 19.025316455696203 "mean": 18.645962732919255
}, },
"month": { "month": {
"min": 14, "min": 4,
"max": 101, "max": 101,
"mean": 81.24324324324324 "mean": 81.13513513513513
} }
}, },
"PROPOSITION": { "PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 21, "max": 21,
"mean": 3.5281818181818183 "mean": 3.447111111111111
}, },
"week": { "week": {
"min": 2, "min": 0,
"max": 43, "max": 43,
"mean": 24.563291139240505 "mean": 24.08695652173913
}, },
"month": { "month": {
"min": 17, "min": 4,
"max": 127, "max": 127,
"mean": 104.89189189189189 "mean": 104.8108108108108
} }
} }
}, },
"respiratory tract diseases": { "respiratory tract diseases": {
"NO KEYWORDS": {
"day": {
"min": 0,
"max": 7,
"mean": 0.7795555555555556
},
"week": {
"min": 0,
"max": 13,
"mean": 5.447204968944099
},
"month": {
"min": 3,
"max": 41,
"mean": 23.7027027027027
}
},
"KEYWORDS": { "KEYWORDS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 1, "max": 1,
"mean": 0.02281021897810219 "mean": 0.021333333333333333
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 2, "max": 2,
"mean": 0.15822784810126583 "mean": 0.14906832298136646
}, },
"month": { "month": {
"min": 0, "min": 0,
"max": 3, "max": 3,
"mean": 0.6756756756756757 "mean": 0.6486486486486487
} }
}, },
"SUBHEADINGS": { "SUBHEADINGS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 1, "max": 1,
"mean": 0.02281021897810219 "mean": 0.021333333333333333
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 2, "max": 2,
"mean": 0.15822784810126583 "mean": 0.14906832298136646
}, },
"month": { "month": {
"min": 0, "min": 0,
"max": 3, "max": 3,
"mean": 0.6756756756756757 "mean": 0.6486486486486487
} }
}, },
"SITE PROPOSITION": { "SITE PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 1, "max": 1,
"mean": 0.02281021897810219 "mean": 0.021333333333333333
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 2, "max": 2,
"mean": 0.15822784810126583 "mean": 0.14906832298136646
}, },
"month": { "month": {
"min": 0, "min": 0,
"max": 3, "max": 3,
"mean": 0.6756756756756757 "mean": 0.6486486486486487
} }
}, },
"PROPOSITION": { "PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 1, "max": 1,
"mean": 0.0364963503649635 "mean": 0.034666666666666665
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 2, "max": 2,
"mean": 0.25316455696202533 "mean": 0.2422360248447205
}, },
"month": { "month": {
"min": 0, "min": 0,
"max": 3, "max": 3,
"mean": 1.0810810810810811 "mean": 1.054054054054054
} }
} }
}, },
"cardiovascular diseases": { "cardiovascular diseases": {
"NO KEYWORDS": {
"day": {
"min": 0,
"max": 119,
"mean": 21.612444444444446
},
"week": {
"min": 0,
"max": 260,
"mean": 151.01863354037266
},
"month": {
"min": 20,
"max": 790,
"mean": 657.1351351351351
}
},
"KEYWORDS": { "KEYWORDS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 5, "max": 5,
"mean": 0.6584699453551912 "mean": 0.64
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 13, "max": 13,
"mean": 4.575949367088608 "mean": 4.472049689440993
}, },
"month": { "month": {
"min": 3, "min": 0,
"max": 40, "max": 40,
"mean": 19.54054054054054 "mean": 19.45945945945946
} }
}, },
"SUBHEADINGS": { "SUBHEADINGS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 5, "max": 5,
"mean": 0.6584699453551912 "mean": 0.64
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 13, "max": 13,
"mean": 4.575949367088608 "mean": 4.472049689440993
}, },
"month": { "month": {
"min": 3, "min": 0,
"max": 40, "max": 40,
"mean": 19.54054054054054 "mean": 19.45945945945946
} }
}, },
"SITE PROPOSITION": { "SITE PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 5, "max": 5,
"mean": 0.6930783242258652 "mean": 0.6737777777777778
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 13, "max": 13,
"mean": 4.8164556962025316 "mean": 4.708074534161491
}, },
"month": { "month": {
"min": 3, "min": 0,
"max": 42, "max": 42,
"mean": 20.56756756756757 "mean": 20.486486486486488
} }
}, },
"PROPOSITION": { "PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 8, "max": 8,
"mean": 1.1474067333939946 "mean": 1.1173333333333333
}, },
"week": { "week": {
"min": 1, "min": 0,
"max": 24, "max": 24,
"mean": 7.981012658227848 "mean": 7.807453416149069
}, },
"month": { "month": {
"min": 5, "min": 1,
"max": 58, "max": 58,
"mean": 34.08108108108108 "mean": 33.972972972972975
} }
} }
}, },
"mental health": { "mental health": {
"NO KEYWORDS": {
"day": {
"min": 0,
"max": 74,
"mean": 16.589333333333332
},
"week": {
"min": 0,
"max": 213,
"mean": 115.9192546583851
},
"month": {
"min": 8,
"max": 679,
"mean": 504.4054054054054
}
},
"KEYWORDS": { "KEYWORDS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 8, "max": 8,
"mean": 1.1856232939035487 "mean": 1.1564444444444444
}, },
"week": { "week": {
"min": 1, "min": 0,
"max": 21, "max": 21,
"mean": 8.246835443037975 "mean": 8.080745341614907
}, },
"month": { "month": {
"min": 4, "min": 1,
"max": 52, "max": 52,
"mean": 35.21621621621622 "mean": 35.16216216216216
} }
}, },
"SUBHEADINGS": { "SUBHEADINGS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 8, "max": 8,
"mean": 1.1856232939035487 "mean": 1.1564444444444444
}, },
"week": { "week": {
"min": 1, "min": 0,
"max": 21, "max": 21,
"mean": 8.246835443037975 "mean": 8.080745341614907
}, },
"month": { "month": {
"min": 4, "min": 1,
"max": 52, "max": 52,
"mean": 35.21621621621622 "mean": 35.16216216216216
} }
}, },
"SITE PROPOSITION": { "SITE PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 8, "max": 8,
"mean": 1.2438580527752503 "mean": 1.2133333333333334
}, },
"week": { "week": {
"min": 1, "min": 0,
"max": 22, "max": 22,
"mean": 8.651898734177216 "mean": 8.478260869565217
}, },
"month": { "month": {
"min": 4, "min": 1,
"max": 55, "max": 55,
"mean": 36.945945945945944 "mean": 36.891891891891895
} }
}, },
"PROPOSITION": { "PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 11, "max": 11,
"mean": 2.1618181818181816 "mean": 2.1093333333333333
}, },
"week": { "week": {
"min": 1, "min": 0,
"max": 32, "max": 32,
"mean": 15.050632911392405 "mean": 14.73913043478261
}, },
"month": { "month": {
"min": 8, "min": 2,
"max": 84, "max": 85,
"mean": 64.27027027027027 "mean": 64.13513513513513
} }
} }
}, },
"diabetes mellitus, type 1": { "diabetes mellitus, type 1": {
"NO KEYWORDS": {
"day": {
"min": 0,
"max": 40,
"mean": 7.019555555555556
},
"week": {
"min": 0,
"max": 78,
"mean": 49.04968944099379
},
"month": {
"min": 2,
"max": 257,
"mean": 213.43243243243242
}
},
"KEYWORDS": { "KEYWORDS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 3, "max": 3,
"mean": 0.3072014585232452 "mean": 0.29688888888888887
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 8, "max": 8,
"mean": 2.132911392405063 "mean": 2.0745341614906834
}, },
"month": { "month": {
"min": 3, "min": 1,
"max": 19, "max": 19,
"mean": 9.108108108108109 "mean": 9.027027027027026
} }
}, },
"SUBHEADINGS": { "SUBHEADINGS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 3, "max": 3,
"mean": 0.3072014585232452 "mean": 0.29688888888888887
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 8, "max": 8,
"mean": 2.132911392405063 "mean": 2.0745341614906834
}, },
"month": { "month": {
"min": 3, "min": 1,
"max": 19, "max": 19,
"mean": 9.108108108108109 "mean": 9.027027027027026
} }
}, },
"SITE PROPOSITION": { "SITE PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 13, "max": 13,
"mean": 1.5141037306642402 "mean": 1.4755555555555555
}, },
"week": { "week": {
"min": 3, "min": 0,
"max": 30, "max": 30,
"mean": 10.531645569620252 "mean": 10.31055900621118
}, },
"month": { "month": {
"min": 5, "min": 1,
"max": 66, "max": 66,
"mean": 44.972972972972975 "mean": 44.86486486486486
} }
}, },
"PROPOSITION": { "PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 13, "max": 13,
"mean": 1.5941765241128298 "mean": 1.5537777777777777
}, },
"week": { "week": {
"min": 3, "min": 0,
"max": 30, "max": 30,
"mean": 11.08860759493671 "mean": 10.857142857142858
}, },
"month": { "month": {
"min": 5, "min": 1,
"max": 68, "max": 68,
"mean": 47.351351351351354 "mean": 47.24324324324324
} }
} }
}, },
"diabetes mellitus, type 2": { "diabetes mellitus, type 2": {
"KEYWORDS": { "NO KEYWORDS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 7, "max": 146,
"mean": 0.9763421292083713 "mean": 28.133333333333333
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 14, "max": 282,
"mean": 6.791139240506329 "mean": 196.583850931677
}, },
"month": { "month": {
"min": 6, "min": 26,
"max": 45, "max": 1021,
"mean": 29 "mean": 855.4054054054054
} }
}, },
"SUBHEADINGS": { "KEYWORDS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 7, "max": 7,
"mean": 0.9763421292083713 "mean": 0.952
}, },
"week": { "week": {
"min": 0, "min": 0,
"max": 14, "max": 14,
"mean": 6.791139240506329 "mean": 6.6521739130434785
}, },
"month": { "month": {
"min": 6, "min": 3,
"max": 45, "max": 45,
"mean": 29 "mean": 28.945945945945947
} }
}, },
"SITE PROPOSITION": { "SUBHEADINGS": {
"day": { "day": {
"min": 0, "min": 0,
"max": 10, "max": 7,
"mean": 1.520909090909091 "mean": 0.952
}, },
"week": { "week": {
"min": 1,
"max": 23,
"mean": 10.58860759493671
},
"month": {
"min": 8,
"max": 61,
"mean": 45.21621621621622
}
},
"PROPOSITION": {
"day": {
"min": 0, "min": 0,
"max": 14, "max": 14,
"mean": 2.099090909090909 "mean": 6.6521739130434785
},
"week": {
"min": 2,
"max": 28,
"mean": 14.613924050632912
},
"month": {
"min": 10,
"max": 84,
"mean": 62.4054054054054
}
}
},
"ALL": {
"KEYWORDS": {
"day": {
"min": 0,
"max": 39,
"mean": 6.54
},
"week": {
"min": 8,
"max": 69,
"mean": 45.53164556962025
},
"month": {
"min": 31,
"max": 262,
"mean": 194.43243243243242
}
},
"SUBHEADINGS": {
"day": {
"min": 0,
"max": 39,
"mean": 6.54
},
"week": {
"min": 8,
"max": 69,
"mean": 45.53164556962025
}, },
"month": { "month": {
"min": 31, "min": 3,
"max": 262, "max": 45,
"mean": 194.43243243243242 "mean": 28.945945945945947
} }
}, },
"SITE PROPOSITION": { "SITE PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 47, "max": 10,
"mean": 8.478181818181818 "mean": 1.4844444444444445
}, },
"week": { "week": {
"min": 8, "min": 0,
"max": 93, "max": 23,
"mean": 59.0253164556962 "mean": 10.372670807453416
}, },
"month": { "month": {
"min": 34, "min": 4,
"max": 310, "max": 62,
"mean": 252.05405405405406 "mean": 45.13513513513514
} }
}, },
"PROPOSITION": { "PROPOSITION": {
"day": { "day": {
"min": 0, "min": 0,
"max": 67, "max": 14,
"mean": 11.658181818181818 "mean": 2.049777777777778
}, },
"week": { "week": {
"min": 9, "min": 0,
"max": 124, "max": 28,
"mean": 81.16455696202532 "mean": 14.322981366459627
}, },
"month": { "month": {
"min": 46, "min": 5,
"max": 431, "max": 84,
"mean": 346.5945945945946 "mean": 62.32432432432432
} }
} }
} }
......
...@@ -8,23 +8,6 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../" ...@@ -8,23 +8,6 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"
from variables.pubmed import NCDS_MESH_TERM, KEYWORDS_MESH_TERM, KEYWORDS_MESH_SUBHEADING, KEYWORDS_MESH_SITE_PROPOSITION, KEYWORDS_MESH_PROPOSITION from variables.pubmed import NCDS_MESH_TERM, KEYWORDS_MESH_TERM, KEYWORDS_MESH_SUBHEADING, KEYWORDS_MESH_SITE_PROPOSITION, KEYWORDS_MESH_PROPOSITION
# Keyword categories for which article counts are kept.
CATEGORIES = [
"KEYWORDS",
"SUBHEADINGS",
"SITE PROPOSITION",
"PROPOSITION"
]
# Time granularities of the counts.
INTERVALS = [
"day",
"week",
"month"
]
# "data" directory located next to this script.
DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data"))
# JSON file of saved articles inside DATA_DIR.
file_path = f"{DATA_DIR}/save_3_years.json"
def lower_keywords(mesh_terms): def lower_keywords(mesh_terms):
res = [] res = []
...@@ -47,33 +30,6 @@ def get_date_indices(date, start_date): ...@@ -47,33 +30,6 @@ def get_date_indices(date, start_date):
return day_index, week_index, month_index return day_index, week_index, month_index
def match_mesh_terms(article_mesh_terms, ncd, keyword):
    """Tell whether an article matches both an NCD term and a keyword.

    Args:
        article_mesh_terms: Collection of the article's MeSH terms.
        ncd: MeSH term of the disease; it must be present in
            ``article_mesh_terms`` for any match to be reported.
        keyword: Either a single term, or a list of terms that must ALL be
            present in ``article_mesh_terms``.

    Returns:
        True when ``ncd`` is present and the keyword (or every part of a
        list keyword) is present too, False otherwise. An empty list keyword
        matches as soon as ``ncd`` is present.
    """
    # Guard clause: without the disease term nothing else matters.
    if ncd not in article_mesh_terms:
        return False
    if isinstance(keyword, list):
        # all() stops at the first missing part instead of scanning them all.
        return all(part in article_mesh_terms for part in keyword)
    return keyword in article_mesh_terms
def init_index(category, counts, ncd, article_date):
    """Ensure the day/week/month buckets of ``article_date`` exist.

    For ``counts[ncd][category]``, creates an empty list under the day, week
    and month indices returned by ``get_date_indices`` (relative to
    2022-01-01) when no entry exists yet. Existing buckets are left untouched.
    """
    origin = datetime(2022, 1, 1)
    day_index, week_index, month_index = get_date_indices(article_date, origin)

    per_interval = counts[ncd][category]
    per_interval["day"].setdefault(day_index, [])
    per_interval["week"].setdefault(week_index, [])
    per_interval["month"].setdefault(month_index, [])
def add_article(article, category, counts, ncd, article_date): def add_article(article, category, counts, ncd, article_date):
start_date = datetime(2022, 1, 1) start_date = datetime(2022, 1, 1)
day_index, week_index, month_index = get_date_indices(article_date, start_date) day_index, week_index, month_index = get_date_indices(article_date, start_date)
...@@ -87,6 +43,35 @@ def add_article(article, category, counts, ncd, article_date): ...@@ -87,6 +43,35 @@ def add_article(article, category, counts, ncd, article_date):
if article["PMID"] not in counts[ncd][category]["month"][month_index]: if article["PMID"] not in counts[ncd][category]["month"][month_index]:
counts[ncd][category]["month"][month_index].append(article["PMID"]) counts[ncd][category]["month"][month_index].append(article["PMID"])
def mesh_term_present(article_mesh_terms, mesh_term):
    """Tell whether a MeSH term is present among an article's MeSH terms.

    Args:
        article_mesh_terms: Collection of the article's MeSH terms.
        mesh_term: Either a single term, or a list of terms that must ALL be
            present in ``article_mesh_terms``.

    Returns:
        True when the term (or every part of a list term) is present,
        False otherwise. An empty list is considered present.
    """
    if isinstance(mesh_term, list):
        # all() stops at the first missing part instead of scanning them all.
        return all(part in article_mesh_terms for part in mesh_term)
    return mesh_term in article_mesh_terms
# Categories for which article counts are kept. The order is significant:
# the counting loop addresses this list by position (CATEGORIES[0] for
# "NO KEYWORDS", then the slices CATEGORIES[1:], [2:], [3:] and [4:]).
CATEGORIES = [
"NO KEYWORDS",
"KEYWORDS",
"SUBHEADINGS",
"SITE PROPOSITION",
"PROPOSITION"
]
# Time granularities of the counts.
INTERVALS = [
"day",
"week",
"month"
]
# "data" and "tmp" directories located next to this script.
DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data"))
TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./tmp"))
# JSON file of saved articles inside TMP_DIR.
file_path = f"{TMP_DIR}/save_3_years.json"
with open(file_path, "r", encoding="utf-8") as file: with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file) data = json.load(file)
...@@ -101,69 +86,23 @@ keywords_proposition_mesh_terms = lower_keywords(KEYWORDS_MESH_PROPOSITION) ...@@ -101,69 +86,23 @@ keywords_proposition_mesh_terms = lower_keywords(KEYWORDS_MESH_PROPOSITION)
counts = {} counts = {}
for ncd in ncds_mesh_terms: counts["ALL"] = {}
counts[ncd] = {
"KEYWORDS" : {
"day": {},
"week": {},
"month": {}
},
"SUBHEADINGS" : {
"day": {},
"week": {},
"month": {}
},
"SITE PROPOSITION" : {
"day": {},
"week": {},
"month": {}
},
"PROPOSITION" : {
"day": {},
"week": {},
"month": {}
},
}
start_date = datetime(2022, 1, 1)
end_date = datetime(2024, 12, 31)
current_date = start_date
while(current_date < end_date): for category in CATEGORIES:
day_index, week_index, month_index = get_date_indices(current_date, start_date) counts["ALL"][category] = {}
for interval in INTERVALS:
counts["ALL"][category][interval] = {}
for category in CATEGORIES: for ncd in ncds_mesh_terms:
counts[ncd][category]["day"][day_index] = [] counts[ncd] = {}
counts[ncd][category]["week"][week_index] = [] for category in CATEGORIES:
counts[ncd][category]["month"][month_index] = [] counts[ncd][category] = {}
for interval in INTERVALS:
counts[ncd][category][interval] = {}
current_date += timedelta(days=1)
counts["ALL"] = {
"KEYWORDS" : {
"day": {},
"week": {},
"month": {}
},
"SUBHEADINGS" : {
"day": {},
"week": {},
"month": {}
},
"SITE PROPOSITION" : {
"day": {},
"week": {},
"month": {}
},
"PROPOSITION" : {
"day": {},
"week": {},
"month": {}
},
}
start_date = datetime(2022, 1, 1) start_date = datetime(2022, 1, 1)
end_date = datetime(2024, 12, 31) end_date = datetime(2025, 1, 30)
current_date = start_date current_date = start_date
while(current_date < end_date): while(current_date < end_date):
...@@ -174,134 +113,80 @@ while(current_date < end_date): ...@@ -174,134 +113,80 @@ while(current_date < end_date):
counts["ALL"][category]["week"][week_index] = [] counts["ALL"][category]["week"][week_index] = []
counts["ALL"][category]["month"][month_index] = [] counts["ALL"][category]["month"][month_index] = []
for ncd in ncds_mesh_terms:
for category in CATEGORIES:
counts[ncd][category]["day"][day_index] = []
counts[ncd][category]["week"][week_index] = []
counts[ncd][category]["month"][month_index] = []
current_date += timedelta(days=1) current_date += timedelta(days=1)
for article in data: for article in data:
mesh_terms = [mesh_term.lower() for mesh_term in article["MeshTerms"]] article_mesh_terms = [mesh_term.lower() for mesh_term in article["MeshTerms"]]
article_date = datetime(int(article["Date"]["Year"]), int(article["Date"]["Month"]), int(article["Date"]["Day"])) article_date = datetime(int(article["Date"]["Year"]), int(article["Date"]["Month"]), int(article["Date"]["Day"]))
if "ALL" not in counts:
counts["ALL"] = {
"KEYWORDS" : {
"day": {},
"week": {},
"month": {}
},
"SUBHEADINGS" : {
"day": {},
"week": {},
"month": {}
},
"SITE PROPOSITION" : {
"day": {},
"week": {},
"month": {}
},
"PROPOSITION" : {
"day": {},
"week": {},
"month": {}
},
}
for ncd in ncds_mesh_terms: for ncd in ncds_mesh_terms:
if ncd not in counts:
counts[ncd] = {
"KEYWORDS" : {
"day": {},
"week": {},
"month": {}
},
"SUBHEADINGS" : {
"day": {},
"week": {},
"month": {}
},
"SITE PROPOSITION" : {
"day": {},
"week": {},
"month": {}
},
"PROPOSITION" : {
"day": {},
"week": {},
"month": {}
},
}
for keyword in keywords_mesh_terms:
if match_mesh_terms(mesh_terms, ncd, keyword):
init_index("KEYWORDS", counts, ncd, article_date)
init_index("SUBHEADINGS", counts, ncd, article_date)
init_index("SITE PROPOSITION", counts, ncd, article_date)
init_index("PROPOSITION", counts, ncd, article_date)
add_article(article, "KEYWORDS", counts, ncd, article_date) if mesh_term_present(article_mesh_terms, ncd):
add_article(article, "SUBHEADINGS", counts, ncd, article_date)
add_article(article, "SITE PROPOSITION", counts, ncd, article_date)
add_article(article, "PROPOSITION", counts, ncd, article_date)
init_index("KEYWORDS", counts, "ALL", article_date) add_article(article, CATEGORIES[0], counts, ncd, article_date)
init_index("SUBHEADINGS", counts, "ALL", article_date) add_article(article, CATEGORIES[0], counts, "ALL", article_date)
init_index("SITE PROPOSITION", counts, "ALL", article_date)
init_index("PROPOSITION", counts, "ALL", article_date)
add_article(article, "KEYWORDS", counts, "ALL", article_date) added = False
add_article(article, "SUBHEADINGS", counts, "ALL", article_date)
add_article(article, "SITE PROPOSITION", counts, "ALL", article_date)
add_article(article, "PROPOSITION", counts, "ALL", article_date)
for keyword in keywords_subheading_mesh_terms: for keyword in keywords_mesh_terms:
if added:
break
if match_mesh_terms(mesh_terms, ncd, keyword): if mesh_term_present(article_mesh_terms, keyword):
init_index("SUBHEADINGS", counts, ncd, article_date)
init_index("SITE PROPOSITION", counts, ncd, article_date)
init_index("PROPOSITION", counts, ncd, article_date)
add_article(article, "SUBHEADINGS", counts, ncd, article_date) for category in CATEGORIES[1:]:
add_article(article, "SITE PROPOSITION", counts, ncd, article_date) add_article(article, category, counts, ncd, article_date)
add_article(article, "PROPOSITION", counts, ncd, article_date) add_article(article, category, counts, "ALL", article_date)
init_index("SUBHEADINGS", counts, "ALL", article_date) added = True
init_index("SITE PROPOSITION", counts, "ALL", article_date)
init_index("PROPOSITION", counts, "ALL", article_date)
add_article(article, "SUBHEADINGS", counts, "ALL", article_date) for keyword in keywords_subheading_mesh_terms:
add_article(article, "SITE PROPOSITION", counts, "ALL", article_date) if added:
add_article(article, "PROPOSITION", counts, "ALL", article_date) break
for keyword in keywords_site_proposition_mesh_terms: if mesh_term_present(article_mesh_terms, keyword):
if match_mesh_terms(mesh_terms, ncd, keyword): for category in CATEGORIES[2:]:
init_index("SITE PROPOSITION", counts, ncd, article_date) add_article(article, category, counts, ncd, article_date)
init_index("PROPOSITION", counts, ncd, article_date) add_article(article, category, counts, "ALL", article_date)
added = True
add_article(article, "SITE PROPOSITION", counts, ncd, article_date) for keyword in keywords_site_proposition_mesh_terms:
add_article(article, "PROPOSITION", counts, ncd, article_date) if added:
break
init_index("SITE PROPOSITION", counts, "ALL", article_date) if mesh_term_present(article_mesh_terms, keyword):
init_index("PROPOSITION", counts, "ALL", article_date)
add_article(article, "SITE PROPOSITION", counts, "ALL", article_date) for category in CATEGORIES[3:]:
add_article(article, "PROPOSITION", counts, "ALL", article_date) add_article(article, category, counts, ncd, article_date)
add_article(article, category, counts, "ALL", article_date)
for keyword in keywords_proposition_mesh_terms: added = True
if match_mesh_terms(mesh_terms, ncd, keyword): for keyword in keywords_proposition_mesh_terms:
init_index("PROPOSITION", counts, ncd, article_date) if added:
break
add_article(article, "PROPOSITION", counts, ncd, article_date) if mesh_term_present(article_mesh_terms, keyword):
init_index("PROPOSITION", counts, "ALL", article_date) for category in CATEGORIES[4:]:
add_article(article, category, counts, ncd, article_date)
add_article(article, category, counts, "ALL", article_date)
add_article(article, "PROPOSITION", counts, "ALL", article_date) added = True
for ncd in ncds_mesh_terms: for ncd in ncds_mesh_terms:
for category in CATEGORIES: for category in CATEGORIES:
for interval in INTERVALS: for interval in INTERVALS:
counts[ncd][category][interval] = [len(tmp) for key, tmp in counts[ncd][category][interval].items()] counts[ncd][category][interval] = [len(tmp) for _, tmp in counts[ncd][category][interval].items()]
counts[ncd][category][interval] = { counts[ncd][category][interval] = {
"min": min(counts[ncd][category][interval]), "min": min(counts[ncd][category][interval]),
......
import sys
import os
import json
# Add the parent directory to the module search path.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))

# "tmp" directory located next to this script; it holds the local save.
TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./tmp"))

# Load the saved articles and report how many entries the file contains.
with open(f"{TMP_DIR}/save_3_years.json", "r", encoding="utf-8") as file:
    data = json.load(file)

print(len(data))
\ No newline at end of file
...@@ -62,6 +62,9 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store ...@@ -62,6 +62,9 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
# obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}") # obj = parseXmlFile(f"{TMP_DIR}/{TMP_FILENAME}")
obj = xmltodict.parse(response.text) obj = xmltodict.parse(response.text)
if "PubmedArticleSet" not in obj:
return []
obj = obj["PubmedArticleSet"] obj = obj["PubmedArticleSet"]
print() print()
......
...@@ -11,27 +11,38 @@ from dataSources.PubMed.pubmedApi import getPubmedData ...@@ -11,27 +11,38 @@ from dataSources.PubMed.pubmedApi import getPubmedData
from variables.pubmed import * from variables.pubmed import *
from dataSources.PubMed.util import * from dataSources.PubMed.util import *
DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./data")) TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./tmp"))
ncds_mesh_noexp = get_mesh_noexp_term(NCDS_MESH_TERM) ncds_mesh_noexp = get_mesh_noexp_term(NCDS_MESH_TERM)
search_term = url_encode(" OR ".join(ncds_mesh_noexp)) search_term = url_encode(" OR ".join(ncds_mesh_noexp))
data = [] data_to_store = []
with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file: with open(f"{TMP_DIR}/save_3_years.json", "w") as json_file:
json.dump(data, json_file, indent=4) json.dump(data_to_store, json_file, indent=4)
current_date = datetime(2022, 1, 1) current_date = datetime(2022, 1, 1)
stored_pmid = []
while(current_date < datetime(2024, 12, 31)): while(current_date < datetime(2024, 12, 31)):
next_date = current_date + timedelta(weeks=1) next_date = current_date + timedelta(days=4)
data = getPubmedData(search_term, current_date.strftime("%Y/%m/%d"), next_date.strftime("%Y/%m/%d"))
if len(data) > 10000:
print("ERROR: MORE THAN 10000 ARTICLES")
exit(1)
data += getPubmedData(search_term, current_date.strftime("%Y/%m/%d"), next_date.strftime("%Y/%m/%d")) for article in data:
if article["PMID"] not in stored_pmid:
data_to_store.append(article)
stored_pmid.append(article["PMID"])
current_date = next_date current_date = next_date
time.sleep(0.1) time.sleep(0.1)
with open(f"{DATA_DIR}/save_3_years.json", "w") as json_file: with open(f"{TMP_DIR}/save_3_years.json", "w") as json_file:
json.dump(data, json_file, indent=4) json.dump(data_to_store, json_file, indent=4)
\ No newline at end of file \ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment