diff --git a/models/LLM/Tokenizer/doc/token_count.json b/models/LLM/Tokenizer/doc/token_count.json
index cbbf029ee14f9d6577f46f67c6c0f8f3eaabbe33..fd0f4dd3b4341e6fe5e9920c705560eaee17e112 100644
--- a/models/LLM/Tokenizer/doc/token_count.json
+++ b/models/LLM/Tokenizer/doc/token_count.json
@@ -3,87 +3,578 @@
         "ALL": 62940760,
         "NO KEYWORDS": {
             "day": {
-                "min": 0,
-                "max": 336390,
-                "mean": 55947.34222222222
+                "input": {
+                    "min": 0,
+                    "max": 336390,
+                    "mean": 55947.34222222222
+                },
+                "output": {
+                    "min": 0,
+                    "max": 100400,
+                    "mean": 16410.31111111111
+                }
             },
             "week": {
-                "min": 0,
-                "max": 610773,
-                "mean": 390936.39751552796
+                "input": {
+                    "min": 0,
+                    "max": 610773,
+                    "mean": 390936.39751552796
+                },
+                "output": {
+                    "min": 0,
+                    "max": 181200,
+                    "mean": 114668.32298136647
+                }
             },
             "month": {
-                "min": 149220,
-                "max": 1988608,
-                "mean": 1701101.6216216215
+                "input": {
+                    "min": 149220,
+                    "max": 1988608,
+                    "mean": 1701101.6216216215
+                },
+                "output": {
+                    "min": 40400,
+                    "max": 587200,
+                    "mean": 498962.1621621622
+                }
             }
         },
         "KEYWORDS": {
             "day": {
-                "min": 0,
-                "max": 14061,
-                "mean": 2494.1857777777777
+                "input": {
+                    "min": 0,
+                    "max": 14061,
+                    "mean": 2494.1857777777777
+                },
+                "output": {
+                    "min": 0,
+                    "max": 3900,
+                    "mean": 728.5333333333333
+                }
             },
             "week": {
-                "min": 0,
-                "max": 28111,
-                "mean": 17428.316770186335
+                "input": {
+                    "min": 0,
+                    "max": 28111,
+                    "mean": 17428.316770186335
+                },
+                "output": {
+                    "min": 0,
+                    "max": 8300,
+                    "mean": 5090.683229813664
+                }
             },
             "month": {
-                "min": 12058,
-                "max": 105204,
-                "mean": 75836.72972972973
+                "input": {
+                    "min": 12058,
+                    "max": 105204,
+                    "mean": 75836.72972972973
+                },
+                "output": {
+                    "min": 3100,
+                    "max": 29500,
+                    "mean": 22151.35135135135
+                }
             }
         },
         "SUBHEADINGS": {
             "day": {
-                "min": 0,
-                "max": 14061,
-                "mean": 2494.1857777777777
+                "input": {
+                    "min": 0,
+                    "max": 14061,
+                    "mean": 2494.1857777777777
+                },
+                "output": {
+                    "min": 0,
+                    "max": 3900,
+                    "mean": 728.5333333333333
+                }
             },
             "week": {
-                "min": 0,
-                "max": 28111,
-                "mean": 17428.316770186335
+                "input": {
+                    "min": 0,
+                    "max": 28111,
+                    "mean": 17428.316770186335
+                },
+                "output": {
+                    "min": 0,
+                    "max": 8300,
+                    "mean": 5090.683229813664
+                }
             },
             "month": {
-                "min": 12058,
-                "max": 105204,
-                "mean": 75836.72972972973
+                "input": {
+                    "min": 12058,
+                    "max": 105204,
+                    "mean": 75836.72972972973
+                },
+                "output": {
+                    "min": 3100,
+                    "max": 29500,
+                    "mean": 22151.35135135135
+                }
             }
         },
         "SITE PROPOSITION": {
             "day": {
-                "min": 0,
-                "max": 17409,
-                "mean": 3292.2702222222224
+                "input": {
+                    "min": 0,
+                    "max": 17409,
+                    "mean": 3292.2702222222224
+                },
+                "output": {
+                    "min": 0,
+                    "max": 4700,
+                    "mean": 943.5555555555555
+                }
             },
             "week": {
-                "min": 0,
-                "max": 36705,
-                "mean": 23004.993788819876
+                "input": {
+                    "min": 0,
+                    "max": 36705,
+                    "mean": 23004.993788819876
+                },
+                "output": {
+                    "min": 0,
+                    "max": 11200,
+                    "mean": 6593.167701863354
+                }
             },
             "month": {
-                "min": 13250,
-                "max": 124682,
-                "mean": 100102.81081081081
+                "input": {
+                    "min": 13250,
+                    "max": 124682,
+                    "mean": 100102.81081081081
+                },
+                "output": {
+                    "min": 3400,
+                    "max": 34700,
+                    "mean": 28689.18918918919
+                }
             }
         },
         "PROPOSITION": {
             "day": {
-                "min": 0,
-                "max": 24471,
-                "mean": 4493.711111111111
+                "input": {
+                    "min": 0,
+                    "max": 24471,
+                    "mean": 4493.711111111111
+                },
+                "output": {
+                    "min": 0,
+                    "max": 6700,
+                    "mean": 1297.1555555555556
+                }
             },
             "week": {
-                "min": 0,
-                "max": 49793,
-                "mean": 31400.155279503106
+                "input": {
+                    "min": 0,
+                    "max": 49793,
+                    "mean": 31400.155279503106
+                },
+                "output": {
+                    "min": 0,
+                    "max": 15300,
+                    "mean": 9063.975155279502
+                }
             },
             "month": {
-                "min": 17661,
-                "max": 172341,
-                "mean": 136633.1081081081
+                "input": {
+                    "min": 17661,
+                    "max": 172341,
+                    "mean": 136633.1081081081
+                },
+                "output": {
+                    "min": 4600,
+                    "max": 48200,
+                    "mean": 39440.54054054054
+                }
+            }
+        }
+    },
+    "roberta-base": {
+        "ALL": 61502461,
+        "NO KEYWORDS": {
+            "day": {
+                "input": {
+                    "min": 0,
+                    "max": 327932,
+                    "mean": 54668.854222222224
+                },
+                "output": {
+                    "min": 0,
+                    "max": 100400,
+                    "mean": 16410.31111111111
+                }
+            },
+            "week": {
+                "input": {
+                    "min": 0,
+                    "max": 597010,
+                    "mean": 382002.8633540373
+                },
+                "output": {
+                    "min": 0,
+                    "max": 181200,
+                    "mean": 114668.32298136647
+                }
+            },
+            "month": {
+                "input": {
+                    "min": 145722,
+                    "max": 1940671,
+                    "mean": 1662228.6756756757
+                },
+                "output": {
+                    "min": 40400,
+                    "max": 587200,
+                    "mean": 498962.1621621622
+                }
+            }
+        },
+        "KEYWORDS": {
+            "day": {
+                "input": {
+                    "min": 0,
+                    "max": 13765,
+                    "mean": 2451.286222222222
+                },
+                "output": {
+                    "min": 0,
+                    "max": 3900,
+                    "mean": 728.5333333333333
+                }
+            },
+            "week": {
+                "input": {
+                    "min": 0,
+                    "max": 27593,
+                    "mean": 17128.552795031057
+                },
+                "output": {
+                    "min": 0,
+                    "max": 8300,
+                    "mean": 5090.683229813664
+                }
+            },
+            "month": {
+                "input": {
+                    "min": 11886,
+                    "max": 103256,
+                    "mean": 74532.35135135135
+                },
+                "output": {
+                    "min": 3100,
+                    "max": 29500,
+                    "mean": 22151.35135135135
+                }
+            }
+        },
+        "SUBHEADINGS": {
+            "day": {
+                "input": {
+                    "min": 0,
+                    "max": 13765,
+                    "mean": 2451.286222222222
+                },
+                "output": {
+                    "min": 0,
+                    "max": 3900,
+                    "mean": 728.5333333333333
+                }
+            },
+            "week": {
+                "input": {
+                    "min": 0,
+                    "max": 27593,
+                    "mean": 17128.552795031057
+                },
+                "output": {
+                    "min": 0,
+                    "max": 8300,
+                    "mean": 5090.683229813664
+                }
+            },
+            "month": {
+                "input": {
+                    "min": 11886,
+                    "max": 103256,
+                    "mean": 74532.35135135135
+                },
+                "output": {
+                    "min": 3100,
+                    "max": 29500,
+                    "mean": 22151.35135135135
+                }
+            }
+        },
+        "SITE PROPOSITION": {
+            "day": {
+                "input": {
+                    "min": 0,
+                    "max": 17097,
+                    "mean": 3239.8124444444443
+                },
+                "output": {
+                    "min": 0,
+                    "max": 4700,
+                    "mean": 943.5555555555555
+                }
+            },
+            "week": {
+                "input": {
+                    "min": 0,
+                    "max": 36147,
+                    "mean": 22638.44099378882
+                },
+                "output": {
+                    "min": 0,
+                    "max": 11200,
+                    "mean": 6593.167701863354
+                }
+            },
+            "month": {
+                "input": {
+                    "min": 13077,
+                    "max": 122603,
+                    "mean": 98507.81081081081
+                },
+                "output": {
+                    "min": 3400,
+                    "max": 34700,
+                    "mean": 28689.18918918919
+                }
+            }
+        },
+        "PROPOSITION": {
+            "day": {
+                "input": {
+                    "min": 0,
+                    "max": 23993,
+                    "mean": 4422.054222222222
+                },
+                "output": {
+                    "min": 0,
+                    "max": 6700,
+                    "mean": 1297.1555555555556
+                }
+            },
+            "week": {
+                "input": {
+                    "min": 0,
+                    "max": 48994,
+                    "mean": 30899.447204968943
+                },
+                "output": {
+                    "min": 0,
+                    "max": 15300,
+                    "mean": 9063.975155279502
+                }
+            },
+            "month": {
+                "input": {
+                    "min": 17429,
+                    "max": 169516,
+                    "mean": 134454.35135135136
+                },
+                "output": {
+                    "min": 4600,
+                    "max": 48200,
+                    "mean": 39440.54054054054
+                }
+            }
+        }
+    },
+    "facebook/bart-large": {
+        "ALL": 61502461,
+        "NO KEYWORDS": {
+            "day": {
+                "input": {
+                    "min": 0,
+                    "max": 327932,
+                    "mean": 54668.854222222224
+                },
+                "output": {
+                    "min": 0,
+                    "max": 100400,
+                    "mean": 16410.31111111111
+                }
+            },
+            "week": {
+                "input": {
+                    "min": 0,
+                    "max": 597010,
+                    "mean": 382002.8633540373
+                },
+                "output": {
+                    "min": 0,
+                    "max": 181200,
+                    "mean": 114668.32298136647
+                }
+            },
+            "month": {
+                "input": {
+                    "min": 145722,
+                    "max": 1940671,
+                    "mean": 1662228.6756756757
+                },
+                "output": {
+                    "min": 40400,
+                    "max": 587200,
+                    "mean": 498962.1621621622
+                }
+            }
+        },
+        "KEYWORDS": {
+            "day": {
+                "input": {
+                    "min": 0,
+                    "max": 13765,
+                    "mean": 2451.286222222222
+                },
+                "output": {
+                    "min": 0,
+                    "max": 3900,
+                    "mean": 728.5333333333333
+                }
+            },
+            "week": {
+                "input": {
+                    "min": 0,
+                    "max": 27593,
+                    "mean": 17128.552795031057
+                },
+                "output": {
+                    "min": 0,
+                    "max": 8300,
+                    "mean": 5090.683229813664
+                }
+            },
+            "month": {
+                "input": {
+                    "min": 11886,
+                    "max": 103256,
+                    "mean": 74532.35135135135
+                },
+                "output": {
+                    "min": 3100,
+                    "max": 29500,
+                    "mean": 22151.35135135135
+                }
+            }
+        },
+        "SUBHEADINGS": {
+            "day": {
+                "input": {
+                    "min": 0,
+                    "max": 13765,
+                    "mean": 2451.286222222222
+                },
+                "output": {
+                    "min": 0,
+                    "max": 3900,
+                    "mean": 728.5333333333333
+                }
+            },
+            "week": {
+                "input": {
+                    "min": 0,
+                    "max": 27593,
+                    "mean": 17128.552795031057
+                },
+                "output": {
+                    "min": 0,
+                    "max": 8300,
+                    "mean": 5090.683229813664
+                }
+            },
+            "month": {
+                "input": {
+                    "min": 11886,
+                    "max": 103256,
+                    "mean": 74532.35135135135
+                },
+                "output": {
+                    "min": 3100,
+                    "max": 29500,
+                    "mean": 22151.35135135135
+                }
+            }
+        },
+        "SITE PROPOSITION": {
+            "day": {
+                "input": {
+                    "min": 0,
+                    "max": 17097,
+                    "mean": 3239.8124444444443
+                },
+                "output": {
+                    "min": 0,
+                    "max": 4700,
+                    "mean": 943.5555555555555
+                }
+            },
+            "week": {
+                "input": {
+                    "min": 0,
+                    "max": 36147,
+                    "mean": 22638.44099378882
+                },
+                "output": {
+                    "min": 0,
+                    "max": 11200,
+                    "mean": 6593.167701863354
+                }
+            },
+            "month": {
+                "input": {
+                    "min": 13077,
+                    "max": 122603,
+                    "mean": 98507.81081081081
+                },
+                "output": {
+                    "min": 3400,
+                    "max": 34700,
+                    "mean": 28689.18918918919
+                }
+            }
+        },
+        "PROPOSITION": {
+            "day": {
+                "input": {
+                    "min": 0,
+                    "max": 23993,
+                    "mean": 4422.054222222222
+                },
+                "output": {
+                    "min": 0,
+                    "max": 6700,
+                    "mean": 1297.1555555555556
+                }
+            },
+            "week": {
+                "input": {
+                    "min": 0,
+                    "max": 48994,
+                    "mean": 30899.447204968943
+                },
+                "output": {
+                    "min": 0,
+                    "max": 15300,
+                    "mean": 9063.975155279502
+                }
+            },
+            "month": {
+                "input": {
+                    "min": 17429,
+                    "max": 169516,
+                    "mean": 134454.35135135136
+                },
+                "output": {
+                    "min": 4600,
+                    "max": 48200,
+                    "mean": 39440.54054054054
+                }
             }
         }
     }
diff --git a/models/LLM/Tokenizer/token_count.py b/models/LLM/Tokenizer/token_count.py
index 051857de6b3da053d5e58ed3c5eccbfbbd9a329d..93c7a4d12e1f266a999ef175b9cec8e9b3759ec5 100644
--- a/models/LLM/Tokenizer/token_count.py
+++ b/models/LLM/Tokenizer/token_count.py
@@ -28,7 +28,11 @@ CATEGORIES = [
 ]
 
 TOKENIZERS = [
-    "bert-base-uncased"
+    #"openai-community/gpt-4",
+    #"meta-llama/Llama-2-7b-hf",
+    "bert-base-uncased",
+    "roberta-base",
+    "facebook/bart-large"
 ]
 
 def lower_keywords(mesh_terms):
@@ -64,12 +68,15 @@ def get_date_indices(date, start_date):
     return day_index, week_index, month_index
 
 
-def add_num_token(article_date, start_date, token_num, counts, tokenizer_name, category):
+def add_token_number(article_date, start_date, input_token_num, output_token_num, counts, tokenizer_name, category):
     day_index, week_index, month_index = get_date_indices(article_date, start_date)
 
-    counts[tokenizer_name][category]["day"][day_index] += token_num
-    counts[tokenizer_name][category]["week"][week_index] += token_num
-    counts[tokenizer_name][category]["month"][month_index] += token_num
+    counts[tokenizer_name][category]["day"]["input"][day_index] += input_token_num
+    counts[tokenizer_name][category]["day"]["output"][day_index] += output_token_num
+    counts[tokenizer_name][category]["week"]["input"][week_index] += input_token_num
+    counts[tokenizer_name][category]["week"]["output"][week_index] += output_token_num
+    counts[tokenizer_name][category]["month"]["input"][month_index] += input_token_num
+    counts[tokenizer_name][category]["month"]["output"][month_index] += output_token_num
 
 
 ncds_mesh_terms = [mesh_term.lower() for ncd, mesh_term in NCDS_MESH_TERM.items()]
@@ -92,9 +99,18 @@ for tokenizer_name in TOKENIZERS:
     counts[tokenizer_name]["ALL"] = 0
     for category in CATEGORIES:
         counts[tokenizer_name][category] = {
-            "day": {},
-            "week": {},
-            "month": {},
+            "day": {
+                "input": {},
+                "output": {}
+            },
+            "week": {
+                "input": {},
+                "output": {}
+            },
+            "month": {
+                "input": {},
+                "output": {}
+            }
         }
 
 start_date = datetime(2022, 1, 1)
@@ -106,9 +122,12 @@ while(current_date < end_date):
 
     for tokenizer_name in TOKENIZERS:
         for category in CATEGORIES:
-            counts[tokenizer_name][category]["day"][day_index] = 0
-            counts[tokenizer_name][category]["week"][week_index] = 0
-            counts[tokenizer_name][category]["month"][month_index] = 0
+            counts[tokenizer_name][category]["day"]["input"][day_index] = 0
+            counts[tokenizer_name][category]["day"]["output"][day_index] = 0
+            counts[tokenizer_name][category]["week"]["input"][week_index] = 0
+            counts[tokenizer_name][category]["week"]["output"][week_index] = 0
+            counts[tokenizer_name][category]["month"]["input"][month_index] = 0
+            counts[tokenizer_name][category]["month"]["output"][month_index] = 0
 
     current_date += timedelta(days=1)
 
@@ -118,6 +137,10 @@ for tokenizer_name in TOKENIZERS:
 
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
 
+    # Flat assumption: every matched article is expected to produce
+    # about 100 output tokens.
+    output_token_num = 100
+
     i = 1
     for article in data:
         print(f"Article N°{i}")
@@ -129,7 +152,7 @@ for tokenizer_name in TOKENIZERS:
         tokens = tokenizer(title+abstract, return_tensors="pt")
         num_tokens = len(tokens["input_ids"][0])
 
-        add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "NO KEYWORDS")
+        add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "NO KEYWORDS")
         counts[tokenizer_name]["ALL"] += num_tokens
 
         added = False
@@ -144,47 +167,48 @@ for tokenizer_name in TOKENIZERS:
             if added:
                 break
            if mesh_term_present(article_mesh_terms, keyword):
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "KEYWORDS")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "SUBHEADINGS")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "SITE PROPOSITION")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "KEYWORDS")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "SUBHEADINGS")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "SITE PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "PROPOSITION")
                 added = True
 
         for keyword in keywords_subheading_mesh_terms:
             if added:
                 break
            if mesh_term_present(article_mesh_terms, keyword):
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "SUBHEADINGS")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "SITE PROPOSITION")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "SUBHEADINGS")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "SITE PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "PROPOSITION")
                 added = True
 
         for keyword in keywords_site_proposition_mesh_terms:
             if added:
                 break
            if mesh_term_present(article_mesh_terms, keyword):
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "SITE PROPOSITION")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "SITE PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "PROPOSITION")
                 added = True
 
         for keyword in keywords_proposition_mesh_terms:
             if added:
                 break
            if mesh_term_present(article_mesh_terms, keyword):
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "PROPOSITION")
                 added = True
 
         i += 1
 
     for category in CATEGORIES:
         for interval in INTERVALS:
-            counts[tokenizer_name][category][interval] = [val for _, val in counts[tokenizer_name][category][interval].items()]
-
-            counts[tokenizer_name][category][interval] = {
-                "min": min(counts[tokenizer_name][category][interval]),
-                "max": max(counts[tokenizer_name][category][interval]),
-                "mean": statistics.mean(counts[tokenizer_name][category][interval])
-            }
+            # "io" instead of "i" so the article counter above is not shadowed.
+            for io in ["input", "output"]:
+                counts[tokenizer_name][category][interval][io] = [val for _, val in counts[tokenizer_name][category][interval][io].items()]
+
+                counts[tokenizer_name][category][interval][io] = {
+                    "min": min(counts[tokenizer_name][category][interval][io]),
+                    "max": max(counts[tokenizer_name][category][interval][io]),
+                    "mean": statistics.mean(counts[tokenizer_name][category][interval][io])
+                }
 
 with open(f"{DOC_DIR}/token_count.json", "w") as json_file:
     json.dump(counts, json_file, indent=4)
diff --git a/models/LLM/prices/calc_llm_prices.py b/models/LLM/prices/calc_llm_prices.py
new file mode 100644
index 0000000000000000000000000000000000000000..51c9ff52ec017f748a24c4e04f4aee003195e69f
--- /dev/null
+++ b/models/LLM/prices/calc_llm_prices.py
@@ -0,0 +1,27 @@
+# Prices in USD per 1M tokens. The GPT-4o figures are the published list price
+# at the time of writing; verify current provider rates before relying on them.
+PRICES = {
+    'Mistral Large': {'input': 2, 'output': 6},
+    'Mistral Small': {'input': 0.1, 'output': 0.3},
+    'GPT-4o': {'input': 2.5, 'output': 10},
+}
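+
+
+# A minimal sketch (not in the original commit): combine PRICES with the mean
+# input/output token counts that token_count.py writes to doc/token_count.json,
+# to estimate what one interval of traffic would cost per model. The counts come
+# from open tokenizers (BERT/RoBERTa/BART), so they only approximate what a
+# commercial model's tokenizer would bill.
+def estimate_cost(input_tokens, output_tokens, model):
+    """Estimated cost in USD for the given token volumes on a single model."""
+    price = PRICES[model]
+    return (input_tokens * price['input'] + output_tokens * price['output']) / 1_000_000
+
+
+if __name__ == '__main__':
+    # Example: mean monthly volumes for the PROPOSITION category
+    # (bert-base-uncased figures from doc/token_count.json).
+    monthly_input = 136633.1081081081
+    monthly_output = 39440.54054054054
+    for model in PRICES:
+        print(f"{model}: ${estimate_cost(monthly_input, monthly_output, model):.2f} per month")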