From 3f2aeaaa5cb588ccc36f8f54a75ceadab5db6974 Mon Sep 17 00:00:00 2001
From: Ivan Pavlovich <ivan.pavlovic@hes-so.ch>
Date: Fri, 14 Mar 2025 03:03:16 +0100
Subject: [PATCH] Start of the calculation to estimate the number of tokens for LLM pricing

---
 models/LLM/Tokenizer/doc/token_count.json |  20 +++++
 models/LLM/Tokenizer/test.py              |  28 ------
 models/LLM/Tokenizer/token_count.py       | 102 ++++++++++++++++++++++
 3 files changed, 122 insertions(+), 28 deletions(-)
 create mode 100644 models/LLM/Tokenizer/doc/token_count.json
 delete mode 100644 models/LLM/Tokenizer/test.py
 create mode 100644 models/LLM/Tokenizer/token_count.py

diff --git a/models/LLM/Tokenizer/doc/token_count.json b/models/LLM/Tokenizer/doc/token_count.json
new file mode 100644
index 000000000..28282848c
--- /dev/null
+++ b/models/LLM/Tokenizer/doc/token_count.json
@@ -0,0 +1,20 @@
+{
+    "bert-base-uncased": {
+        "day": {
+            "min": 0,
+            "max": 336390,
+            "mean": 55947.34222222222
+        },
+        "week": {
+            "min": 0,
+            "max": 610773,
+            "mean": 390936.39751552796
+        },
+        "month": {
+            "min": 149220,
+            "max": 1988608,
+            "mean": 1701101.6216216215
+        },
+        "ALL": 62940760
+    }
+}
\ No newline at end of file
diff --git a/models/LLM/Tokenizer/test.py b/models/LLM/Tokenizer/test.py
deleted file mode 100644
index 9b6a6df9b..000000000
--- a/models/LLM/Tokenizer/test.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from transformers import AutoTokenizer
-import json
-import sys
-import os
-import statistics
-from datetime import datetime, timedelta
-
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
-
-from testModel.utils import get_article_data
-
-DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../dataSources/PubMed/data"))
-
-file_path = f"{DATA_DIR}/save_3_years.json"
-
-with open(file_path, "r", encoding="utf-8") as file:
-    data = json.load(file)
-
-tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-
-for article in data:
-
-    title, abstract = get_article_data(article)
-
-    tokens = tokenizer(title+abstract, return_tensors="pt")
-    num_tokens = len(tokens["input_ids"][0])
-
-    print(f"Number of tokens: {num_tokens}")
diff --git a/models/LLM/Tokenizer/token_count.py b/models/LLM/Tokenizer/token_count.py
new file mode 100644
index 000000000..98f377612
--- /dev/null
+++ b/models/LLM/Tokenizer/token_count.py
@@ -0,0 +1,102 @@
+from transformers import AutoTokenizer
+import json
+import sys
+import os
+import statistics
+from datetime import datetime, timedelta
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
+
+from testModel.utils import get_article_data
+
+DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../dataSources/PubMed/data"))
+DOC_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./doc"))
+
+INTERVALS = [
+    "day",
+    "week",
+    "month"
+]
+
+TOKENIZERS = [
+    "bert-base-uncased"
+]
+
+def get_date_indices(date, start_date):
+    delta_days = (date - start_date).days
+    day_index = delta_days
+
+    week_index = (delta_days // 7)
+
+    delta_months = (date.year - start_date.year) * 12 + (date.month - start_date.month)
+    month_index = delta_months
+
+    return day_index, week_index, month_index
+
+
+file_path = f"{DATA_DIR}/save_3_years.json"
+with open(file_path, "r", encoding="utf-8") as file:
+    data = json.load(file)
+
+print(len(data))
+
+counts = {}
+
+for tokenizer_name in TOKENIZERS:
+    counts[tokenizer_name] = {
+        "day": {},
+        "week": {},
+        "month": {},
+        "ALL": 0
+    }
+
+start_date = datetime(2022, 1, 1)
+end_date = datetime(2025, 1, 30)
+current_date = start_date
+
+while(current_date < end_date):
+    day_index, week_index, month_index = get_date_indices(current_date, start_date)
+
+    for tokenizer_name in TOKENIZERS:
+        counts[tokenizer_name]["day"][day_index] = 0
+        counts[tokenizer_name]["week"][week_index] = 0
+        counts[tokenizer_name]["month"][month_index] = 0
+
+    current_date += timedelta(days=1)
+
+
+
+for tokenizer_name in TOKENIZERS:
+
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+
+    i = 1
+    for article in data:
+        print(f"Article N°{i}")
+
+        article_date = datetime(int(article["Date"]["Year"]), int(article["Date"]["Month"]), int(article["Date"]["Day"]))
+        title, abstract = get_article_data(article)
+
+        tokens = tokenizer(title+abstract, return_tensors="pt")
+        num_tokens = len(tokens["input_ids"][0])
+
+        day_index, week_index, month_index = get_date_indices(article_date, start_date)
+
+        counts[tokenizer_name]["day"][day_index] += num_tokens
+        counts[tokenizer_name]["week"][week_index] += num_tokens
+        counts[tokenizer_name]["month"][month_index] += num_tokens
+        counts[tokenizer_name]["ALL"] += num_tokens
+
+        i += 1
+
+    for interval in INTERVALS:
+        counts[tokenizer_name][interval] = [val for _, val in counts[tokenizer_name][interval].items()]
+
+        counts[tokenizer_name][interval] = {
+            "min": min(counts[tokenizer_name][interval]),
+            "max": max(counts[tokenizer_name][interval]),
+            "mean": statistics.mean(counts[tokenizer_name][interval])
+        }
+
+with open(f"{DOC_DIR}/token_count.json", "w") as json_file:
+    json.dump(counts, json_file, indent=4)
--
GitLab
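
As a rough illustration of the pricing step this commit works toward, the aggregates written to doc/token_count.json can be multiplied by a per-token rate. The snippet below is a minimal sketch of that idea and is not part of the patch: it assumes the token_count.json layout produced by token_count.py above, and the model names and prices per million tokens are placeholder assumptions, not real provider rates.

# Hypothetical follow-up sketch: turn the aggregates in doc/token_count.json
# into a cost estimate. Prices are placeholder assumptions in USD per
# million input tokens, not real provider rates.
import json

PRICE_PER_MILLION_TOKENS = {
    "example-model-small": 0.50,
    "example-model-large": 5.00,
}

with open("doc/token_count.json", "r", encoding="utf-8") as f:
    counts = json.load(f)

for tokenizer_name, stats in counts.items():
    total_tokens = stats["ALL"]                 # tokens over the whole dataset
    mean_month_tokens = stats["month"]["mean"]  # average tokens per month

    for model_name, price in PRICE_PER_MILLION_TOKENS.items():
        total_cost = total_tokens / 1_000_000 * price
        monthly_cost = mean_month_tokens / 1_000_000 * price
        print(f"{tokenizer_name} -> {model_name}: "
              f"~${total_cost:.2f} total, ~${monthly_cost:.2f} per month")

Note that the counts in token_count.json come from the bert-base-uncased tokenizer; an estimate for a specific LLM would ideally re-count with that model's own tokenizer, since token counts differ between tokenizers.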