Commit e8f3c549 authored by Ivan Pavlovich

Added 2 more tokenizers and started the price calculation

parent aeac2c49
@@ -3,87 +3,578 @@
"ALL": 62940760,
"NO KEYWORDS": {
"day": {
"input": {
"min": 0,
"max": 336390,
"mean": 55947.34222222222
},
"output": {
"min": 0,
"max": 100400,
"mean": 16410.31111111111
}
},
"week": {
"input": {
"min": 0,
"max": 610773,
"mean": 390936.39751552796
},
"output": {
"min": 0,
"max": 181200,
"mean": 114668.32298136647
}
},
"month": {
"input": {
"min": 149220,
"max": 1988608,
"mean": 1701101.6216216215
},
"output": {
"min": 40400,
"max": 587200,
"mean": 498962.1621621622
}
}
},
"KEYWORDS": {
"day": {
"input": {
"min": 0,
"max": 14061,
"mean": 2494.1857777777777
},
"output": {
"min": 0,
"max": 3900,
"mean": 728.5333333333333
}
},
"week": {
"input": {
"min": 0,
"max": 28111,
"mean": 17428.316770186335
},
"output": {
"min": 0,
"max": 8300,
"mean": 5090.683229813664
}
},
"month": {
"input": {
"min": 12058,
"max": 105204,
"mean": 75836.72972972973
},
"output": {
"min": 3100,
"max": 29500,
"mean": 22151.35135135135
}
}
},
"SUBHEADINGS": {
"day": {
"input": {
"min": 0,
"max": 14061,
"mean": 2494.1857777777777
},
"output": {
"min": 0,
"max": 3900,
"mean": 728.5333333333333
}
},
"week": {
"input": {
"min": 0,
"max": 28111,
"mean": 17428.316770186335
},
"output": {
"min": 0,
"max": 8300,
"mean": 5090.683229813664
}
},
"month": {
"input": {
"min": 12058,
"max": 105204,
"mean": 75836.72972972973
},
"output": {
"min": 3100,
"max": 29500,
"mean": 22151.35135135135
}
}
},
"SITE PROPOSITION": {
"day": {
"input": {
"min": 0,
"max": 17409,
"mean": 3292.2702222222224
},
"output": {
"min": 0,
"max": 4700,
"mean": 943.5555555555555
}
},
"week": {
"input": {
"min": 0,
"max": 36705,
"mean": 23004.993788819876
},
"output": {
"min": 0,
"max": 11200,
"mean": 6593.167701863354
}
},
"month": {
"input": {
"min": 13250,
"max": 124682,
"mean": 100102.81081081081
},
"output": {
"min": 3400,
"max": 34700,
"mean": 28689.18918918919
}
}
},
"PROPOSITION": {
"day": {
"input": {
"min": 0,
"max": 24471,
"mean": 4493.711111111111
},
"output": {
"min": 0,
"max": 6700,
"mean": 1297.1555555555556
}
},
"week": {
"input": {
"min": 0,
"max": 49793,
"mean": 31400.155279503106
},
"output": {
"min": 0,
"max": 15300,
"mean": 9063.975155279502
}
},
"month": {
"input": {
"min": 17661,
"max": 172341,
"mean": 136633.1081081081
},
"output": {
"min": 4600,
"max": 48200,
"mean": 39440.54054054054
}
}
}
},
"roberta-base": {
"ALL": 61502461,
"NO KEYWORDS": {
"day": {
"input": {
"min": 0,
"max": 327932,
"mean": 54668.854222222224
},
"output": {
"min": 0,
"max": 100400,
"mean": 16410.31111111111
}
},
"week": {
"input": {
"min": 0,
"max": 597010,
"mean": 382002.8633540373
},
"output": {
"min": 0,
"max": 181200,
"mean": 114668.32298136647
}
},
"month": {
"input": {
"min": 145722,
"max": 1940671,
"mean": 1662228.6756756757
},
"output": {
"min": 40400,
"max": 587200,
"mean": 498962.1621621622
}
}
},
"KEYWORDS": {
"day": {
"input": {
"min": 0,
"max": 13765,
"mean": 2451.286222222222
},
"output": {
"min": 0,
"max": 3900,
"mean": 728.5333333333333
}
},
"week": {
"input": {
"min": 0,
"max": 27593,
"mean": 17128.552795031057
},
"output": {
"min": 0,
"max": 8300,
"mean": 5090.683229813664
}
},
"month": {
"input": {
"min": 11886,
"max": 103256,
"mean": 74532.35135135135
},
"output": {
"min": 3100,
"max": 29500,
"mean": 22151.35135135135
}
}
},
"SUBHEADINGS": {
"day": {
"input": {
"min": 0,
"max": 13765,
"mean": 2451.286222222222
},
"output": {
"min": 0,
"max": 3900,
"mean": 728.5333333333333
}
},
"week": {
"input": {
"min": 0,
"max": 27593,
"mean": 17128.552795031057
},
"output": {
"min": 0,
"max": 8300,
"mean": 5090.683229813664
}
},
"month": {
"input": {
"min": 11886,
"max": 103256,
"mean": 74532.35135135135
},
"output": {
"min": 3100,
"max": 29500,
"mean": 22151.35135135135
}
}
},
"SITE PROPOSITION": {
"day": {
"input": {
"min": 0,
"max": 17097,
"mean": 3239.8124444444443
},
"output": {
"min": 0,
"max": 4700,
"mean": 943.5555555555555
}
},
"week": {
"input": {
"min": 0,
"max": 36147,
"mean": 22638.44099378882
},
"output": {
"min": 0,
"max": 11200,
"mean": 6593.167701863354
}
},
"month": {
"input": {
"min": 13077,
"max": 122603,
"mean": 98507.81081081081
},
"output": {
"min": 3400,
"max": 34700,
"mean": 28689.18918918919
}
}
},
"PROPOSITION": {
"day": {
"input": {
"min": 0,
"max": 23993,
"mean": 4422.054222222222
},
"output": {
"min": 0,
"max": 6700,
"mean": 1297.1555555555556
}
},
"week": {
"input": {
"min": 0,
"max": 48994,
"mean": 30899.447204968943
},
"output": {
"min": 0,
"max": 15300,
"mean": 9063.975155279502
}
},
"month": {
"input": {
"min": 17429,
"max": 169516,
"mean": 134454.35135135136
},
"output": {
"min": 4600,
"max": 48200,
"mean": 39440.54054054054
}
}
}
},
"facebook/bart-large": {
"ALL": 61502461,
"NO KEYWORDS": {
"day": {
"input": {
"min": 0,
"max": 327932,
"mean": 54668.854222222224
},
"output": {
"min": 0,
"max": 100400,
"mean": 16410.31111111111
}
},
"week": {
"input": {
"min": 0,
"max": 597010,
"mean": 382002.8633540373
},
"output": {
"min": 0,
"max": 181200,
"mean": 114668.32298136647
}
},
"month": {
"input": {
"min": 145722,
"max": 1940671,
"mean": 1662228.6756756757
},
"output": {
"min": 40400,
"max": 587200,
"mean": 498962.1621621622
}
}
},
"KEYWORDS": {
"day": {
"input": {
"min": 0,
"max": 13765,
"mean": 2451.286222222222
},
"output": {
"min": 0,
"max": 3900,
"mean": 728.5333333333333
}
},
"week": {
"input": {
"min": 0,
"max": 27593,
"mean": 17128.552795031057
},
"output": {
"min": 0,
"max": 8300,
"mean": 5090.683229813664
}
},
"month": {
"input": {
"min": 11886,
"max": 103256,
"mean": 74532.35135135135
},
"output": {
"min": 3100,
"max": 29500,
"mean": 22151.35135135135
}
}
},
"SUBHEADINGS": {
"day": {
"input": {
"min": 0,
"max": 13765,
"mean": 2451.286222222222
},
"output": {
"min": 0,
"max": 3900,
"mean": 728.5333333333333
}
},
"week": {
"input": {
"min": 0,
"max": 27593,
"mean": 17128.552795031057
},
"output": {
"min": 0,
"max": 8300,
"mean": 5090.683229813664
}
},
"month": {
"input": {
"min": 11886,
"max": 103256,
"mean": 74532.35135135135
},
"output": {
"min": 3100,
"max": 29500,
"mean": 22151.35135135135
}
}
},
"SITE PROPOSITION": {
"day": {
"input": {
"min": 0,
"max": 17097,
"mean": 3239.8124444444443
},
"output": {
"min": 0,
"max": 4700,
"mean": 943.5555555555555
}
},
"week": {
"input": {
"min": 0,
"max": 36147,
"mean": 22638.44099378882
},
"output": {
"min": 0,
"max": 11200,
"mean": 6593.167701863354
}
},
"month": {
"input": {
"min": 13077,
"max": 122603,
"mean": 98507.81081081081
},
"output": {
"min": 3400,
"max": 34700,
"mean": 28689.18918918919
}
}
},
"PROPOSITION": {
"day": {
"input": {
"min": 0,
"max": 23993,
"mean": 4422.054222222222
},
"output": {
"min": 0,
"max": 6700,
"mean": 1297.1555555555556
}
},
"week": {
"input": {
"min": 0,
"max": 48994,
"mean": 30899.447204968943
},
"output": {
"min": 0,
"max": 15300,
"mean": 9063.975155279502
}
},
"month": {
"input": {
"min": 17429,
"max": 169516,
"mean": 134454.35135135136
},
"output": {
"min": 4600,
"max": 48200,
"mean": 39440.54054054054
}
}
}
}
......
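For orientation, the statistics above nest tokenizer → category → interval ("day" / "week" / "month") → direction ("input" / "output") → {min, max, mean}. A minimal sketch of reading them back, assuming the file has already been written by the script below (the bare filename here stands in for the script's f"{DOC_DIR}/token_count.json"):

import json

# Load the statistics produced by the counting script; the relative path is an
# assumption here (the script writes to f"{DOC_DIR}/token_count.json").
with open("token_count.json") as json_file:
    counts = json.load(json_file)

# Mean monthly input/output token volume for one tokenizer and category:
month = counts["roberta-base"]["PROPOSITION"]["month"]
print(month["input"]["mean"])   # ~134454.35
print(month["output"]["mean"])  # ~39440.54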
@@ -28,7 +28,11 @@ CATEGORIES = [
 ]

 TOKENIZERS = [
-    "bert-base-uncased"
+    #"openai-community/gpt-4",
+    #"meta-llama/Llama-2-7b-hf",
+    "bert-base-uncased",
+    "roberta-base",
+    "facebook/bart-large"
 ]

 def lower_keywords(mesh_terms):
@@ -64,12 +68,15 @@ def get_date_indices(date, start_date):
     return day_index, week_index, month_index

-def add_num_token(article_date, start_date, token_num, counts, tokenizer_name, category):
+def add_token_number(article_date, start_date, input_token_num, output_token_num, counts, tokenizer_name, category):
     day_index, week_index, month_index = get_date_indices(article_date, start_date)

-    counts[tokenizer_name][category]["day"][day_index] += token_num
-    counts[tokenizer_name][category]["week"][week_index] += token_num
-    counts[tokenizer_name][category]["month"][month_index] += token_num
+    counts[tokenizer_name][category]["day"]["input"][day_index] += input_token_num
+    counts[tokenizer_name][category]["day"]["output"][day_index] += output_token_num
+    counts[tokenizer_name][category]["week"]["input"][week_index] += input_token_num
+    counts[tokenizer_name][category]["week"]["output"][week_index] += output_token_num
+    counts[tokenizer_name][category]["month"]["input"][month_index] += input_token_num
+    counts[tokenizer_name][category]["month"]["output"][month_index] += output_token_num

 ncds_mesh_terms = [mesh_term.lower() for ncd, mesh_term in NCDS_MESH_TERM.items()]
@@ -92,9 +99,18 @@ for tokenizer_name in TOKENIZERS:
     counts[tokenizer_name]["ALL"] = 0
     for category in CATEGORIES:
         counts[tokenizer_name][category] = {
-            "day": {},
-            "week": {},
-            "month": {},
+            "day": {
+                "input": {},
+                "output": {}
+            },
+            "week": {
+                "input": {},
+                "output": {}
+            },
+            "month": {
+                "input": {},
+                "output": {}
+            }
         }

 start_date = datetime(2022, 1, 1)
@@ -106,9 +122,12 @@ while(current_date < end_date):
     for tokenizer_name in TOKENIZERS:
         for category in CATEGORIES:
-            counts[tokenizer_name][category]["day"][day_index] = 0
-            counts[tokenizer_name][category]["week"][week_index] = 0
-            counts[tokenizer_name][category]["month"][month_index] = 0
+            counts[tokenizer_name][category]["day"]["input"][day_index] = 0
+            counts[tokenizer_name][category]["day"]["output"][day_index] = 0
+            counts[tokenizer_name][category]["week"]["input"][week_index] = 0
+            counts[tokenizer_name][category]["week"]["output"][week_index] = 0
+            counts[tokenizer_name][category]["month"]["input"][month_index] = 0
+            counts[tokenizer_name][category]["month"]["output"][month_index] = 0

     current_date += timedelta(days=1)
@@ -118,6 +137,8 @@ for tokenizer_name in TOKENIZERS:
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

+    output_token_num = 100
+
     i = 1
     for article in data:
         print(f"Article N°{i}")
@@ -129,7 +150,7 @@ for tokenizer_name in TOKENIZERS:
         tokens = tokenizer(title+abstract, return_tensors="pt")
         num_tokens = len(tokens["input_ids"][0])

-        add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "NO KEYWORDS")
+        add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "NO KEYWORDS")
         counts[tokenizer_name]["ALL"] += num_tokens

         added = False
@@ -144,46 +165,47 @@ for tokenizer_name in TOKENIZERS:
             if added:
                 break
             if mesh_term_present(article_mesh_terms, keyword):
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "KEYWORDS")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "SUBHEADINGS")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "SITE PROPOSITION")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "KEYWORDS")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "SUBHEADINGS")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "SITE PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "PROPOSITION")
                 added = True

         for keyword in keywords_subheading_mesh_terms:
             if added:
                 break
             if mesh_term_present(article_mesh_terms, keyword):
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "SUBHEADINGS")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "SITE PROPOSITION")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "SUBHEADINGS")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "SITE PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "PROPOSITION")
                 added = True

         for keyword in keywords_site_proposition_mesh_terms:
             if added:
                 break
             if mesh_term_present(article_mesh_terms, keyword):
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "SITE PROPOSITION")
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "SITE PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "PROPOSITION")
                 added = True

         for keyword in keywords_proposition_mesh_terms:
             if added:
                 break
             if mesh_term_present(article_mesh_terms, keyword):
-                add_num_token(article_date, start_date, num_tokens, counts, tokenizer_name, "PROPOSITION")
+                add_token_number(article_date, start_date, num_tokens, output_token_num, counts, tokenizer_name, "PROPOSITION")
                 added = True

         i += 1

     for category in CATEGORIES:
         for interval in INTERVALS:
-            counts[tokenizer_name][category][interval] = [val for _, val in counts[tokenizer_name][category][interval].items()]
+            for i in ["input", "output"]:
+                counts[tokenizer_name][category][interval][i] = [val for _, val in counts[tokenizer_name][category][interval][i].items()]

-            counts[tokenizer_name][category][interval] = {
-                "min": min(counts[tokenizer_name][category][interval]),
-                "max": max(counts[tokenizer_name][category][interval]),
-                "mean": statistics.mean(counts[tokenizer_name][category][interval])
-            }
+                counts[tokenizer_name][category][interval][i] = {
+                    "min": min(counts[tokenizer_name][category][interval][i]),
+                    "max": max(counts[tokenizer_name][category][interval][i]),
+                    "mean": statistics.mean(counts[tokenizer_name][category][interval][i])
+                }

 with open(f"{DOC_DIR}/token_count.json", "w") as json_file:
......
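One subtlety in the aggregation step above: the while loop pre-creates every day/week/month bucket in chronological order, and Python 3.7+ dicts preserve insertion order, so the comprehension over .items() yields a time-ordered series before min/max/mean are taken. A toy illustration of that flattening:

import statistics

day_counts = {0: 120, 1: 0, 2: 340}             # day_index -> token count, inserted in order
series = [val for _, val in day_counts.items()]  # [120, 0, 340], chronological
print(min(series), max(series), statistics.mean(series))  # 0 340 153.33...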
# Prices per 1M tokens
PRICES = {
    'Mistral Large': { 'input': 2, 'output': 6 },
    'Mistral Small': { 'input': 0.1, 'output': 0.3 },
    'GPT-4o': { 'input': 0.1, 'output': 0.3 },  # placeholder copied from Mistral Small; replace with real GPT-4o rates
}
\ No newline at end of file
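Because PRICES is quoted per 1M tokens, the mean counts above already support a first cost estimate. A minimal sketch of the calculation this commit starts, assuming the PRICES dict above is in scope; the pairing of model to tokenizer is an illustration only, not something the source fixes:

import json

with open("token_count.json") as json_file:  # path is an assumption, as before
    counts = json.load(json_file)

# Mean monthly cost for the PROPOSITION category, tokens priced per 1M.
# Pairing Mistral Large with the bert-base-uncased counts is illustrative only.
month = counts["bert-base-uncased"]["PROPOSITION"]["month"]
price = PRICES["Mistral Large"]  # {'input': 2, 'output': 6} dollars per 1M tokens

cost = (month["input"]["mean"] * price["input"]
        + month["output"]["mean"] * price["output"]) / 1_000_000
print(f"~${cost:.2f} per month")  # ≈ 0.27 + 0.24 ≈ $0.51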