Commit 3f2aeaaa authored by Ivan Pavlovich

Start of the calculation to estimate the number of tokens for LLM pricing

parent a00eaaef
{
    "bert-base-uncased": {
        "day": {
            "min": 0,
            "max": 336390,
            "mean": 55947.34222222222
        },
        "week": {
            "min": 0,
            "max": 610773,
            "mean": 390936.39751552796
        },
        "month": {
            "min": 149220,
            "max": 1988608,
            "mean": 1701101.6216216215
        },
        "ALL": 62940760
    }
}
from transformers import AutoTokenizer
import json
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))

from testModel.utils import get_article_data

DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../dataSources/PubMed/data"))

file_path = f"{DATA_DIR}/save_3_years.json"

with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

for article in data:
    title, abstract = get_article_data(article)

    # Tokenize title and abstract together (no separator is inserted
    # between them) and count the resulting input IDs.
    tokens = tokenizer(title + abstract, return_tensors="pt")
    num_tokens = len(tokens["input_ids"][0])

    print(f"Number of tokens: {num_tokens}")

from transformers import AutoTokenizer
import json
import sys
import os
import statistics
from datetime import datetime, timedelta

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))

from testModel.utils import get_article_data

DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../dataSources/PubMed/data"))
DOC_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./doc"))

INTERVALS = [
    "day",
    "week",
    "month"
]

TOKENIZERS = [
    "bert-base-uncased"
]

def get_date_indices(date, start_date):
    # Map a date to its day, week and month bucket index relative to start_date.
    delta_days = (date - start_date).days

    day_index = delta_days
    week_index = delta_days // 7
    month_index = (date.year - start_date.year) * 12 + (date.month - start_date.month)

    return day_index, week_index, month_index
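
# Example (assuming start_date = datetime(2022, 1, 1)):
#   get_date_indices(datetime(2022, 2, 15), start_date) -> (45, 6, 1),
#   i.e. the 45th day, 6th week and 1st month after the start date.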

file_path = f"{DATA_DIR}/save_3_years.json"

with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

print(len(data))

counts = {}

for tokenizer_name in TOKENIZERS:
    counts[tokenizer_name] = {
        "day": {},
        "week": {},
        "month": {},
        "ALL": 0
    }

# Pre-fill every day, week and month bucket with 0 so that intervals
# without any article still count toward the min/max/mean statistics.
start_date = datetime(2022, 1, 1)
end_date = datetime(2025, 1, 30)

current_date = start_date

while current_date < end_date:
    day_index, week_index, month_index = get_date_indices(current_date, start_date)

    for tokenizer_name in TOKENIZERS:
        counts[tokenizer_name]["day"][day_index] = 0
        counts[tokenizer_name]["week"][week_index] = 0
        counts[tokenizer_name]["month"][month_index] = 0

    current_date += timedelta(days=1)

for tokenizer_name in TOKENIZERS:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    for i, article in enumerate(data, start=1):
        print(f"Article N°{i}")

        article_date = datetime(int(article["Date"]["Year"]), int(article["Date"]["Month"]), int(article["Date"]["Day"]))

        title, abstract = get_article_data(article)

        # Count tokens for the concatenated title and abstract (no separator
        # is inserted between them, matching the committed statistics above).
        tokens = tokenizer(title + abstract, return_tensors="pt")
        num_tokens = len(tokens["input_ids"][0])

        day_index, week_index, month_index = get_date_indices(article_date, start_date)

        counts[tokenizer_name]["day"][day_index] += num_tokens
        counts[tokenizer_name]["week"][week_index] += num_tokens
        counts[tokenizer_name]["month"][month_index] += num_tokens
        counts[tokenizer_name]["ALL"] += num_tokens

    # Collapse each interval's buckets into min/max/mean summary statistics.
    for interval in INTERVALS:
        interval_counts = list(counts[tokenizer_name][interval].values())
        counts[tokenizer_name][interval] = {
            "min": min(interval_counts),
            "max": max(interval_counts),
            "mean": statistics.mean(interval_counts)
        }
with open(f"{DOC_DIR}/token_count.json", "w") as json_file:
json.dump(counts, json_file, indent=4)
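
To turn these counts into a price estimate, a minimal sketch along these lines could read the generated doc/token_count.json and multiply the token counts by a per-token rate; the price constant below is a placeholder, not a real provider tariff.

import json

# Hypothetical price per 1,000 input tokens (placeholder, not a real tariff).
PRICE_PER_1K_TOKENS_USD = 0.0005

with open("doc/token_count.json", "r", encoding="utf-8") as f:
    token_counts = json.load(f)

stats = token_counts["bert-base-uncased"]

# Mean tokens per month, and the grand total over the whole 3-year window.
print(f"Mean monthly cost: ${stats['month']['mean'] / 1000 * PRICE_PER_1K_TOKENS_USD:.2f}")
print(f"Total cost: ${stats['ALL'] / 1000 * PRICE_PER_1K_TOKENS_USD:.2f}")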