From 3f2aeaaa5cb588ccc36f8f54a75ceadab5db6974 Mon Sep 17 00:00:00 2001
From: Ivan Pavlovich <ivan.pavlovic@hes-so.ch>
Date: Fri, 14 Mar 2025 03:03:16 +0100
Subject: [PATCH] Start of the calculation to estimate the number of tokens
 for LLM pricing

---
 models/LLM/Tokenizer/doc/token_count.json |  20 +++++
 models/LLM/Tokenizer/test.py              |  28 ------
 models/LLM/Tokenizer/token_count.py       | 105 ++++++++++++++++++++++
 3 files changed, 125 insertions(+), 28 deletions(-)
 create mode 100644 models/LLM/Tokenizer/doc/token_count.json
 delete mode 100644 models/LLM/Tokenizer/test.py
 create mode 100644 models/LLM/Tokenizer/token_count.py

diff --git a/models/LLM/Tokenizer/doc/token_count.json b/models/LLM/Tokenizer/doc/token_count.json
new file mode 100644
index 000000000..28282848c
--- /dev/null
+++ b/models/LLM/Tokenizer/doc/token_count.json
@@ -0,0 +1,20 @@
+{
+    "bert-base-uncased": {
+        "day": {
+            "min": 0,
+            "max": 336390,
+            "mean": 55947.34222222222
+        },
+        "week": {
+            "min": 0,
+            "max": 610773,
+            "mean": 390936.39751552796
+        },
+        "month": {
+            "min": 149220,
+            "max": 1988608,
+            "mean": 1701101.6216216215
+        },
+        "ALL": 62940760
+    }
+}
\ No newline at end of file
diff --git a/models/LLM/Tokenizer/test.py b/models/LLM/Tokenizer/test.py
deleted file mode 100644
index 9b6a6df9b..000000000
--- a/models/LLM/Tokenizer/test.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from transformers import AutoTokenizer
-import json
-import sys
-import os
-import statistics
-from datetime import datetime, timedelta
-
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
-
-from testModel.utils import get_article_data
-
-DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../dataSources/PubMed/data"))
-
-file_path = f"{DATA_DIR}/save_3_years.json"
-
-with open(file_path, "r", encoding="utf-8") as file:
-    data = json.load(file)
-
-tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-
-for article in data:
-
-    title, abstract = get_article_data(article)
-
-    tokens = tokenizer(title+abstract, return_tensors="pt")
-    num_tokens = len(tokens["input_ids"][0])
-
-    print(f"Number of tokens: {num_tokens}")
diff --git a/models/LLM/Tokenizer/token_count.py b/models/LLM/Tokenizer/token_count.py
new file mode 100644
index 000000000..98f377612
--- /dev/null
+++ b/models/LLM/Tokenizer/token_count.py
@@ -0,0 +1,105 @@
+from transformers import AutoTokenizer
+import json
+import sys
+import os
+import statistics
+from datetime import datetime, timedelta
+
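+# Make the repository root importable so testModel.utils can be resolved.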
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
+
+from testModel.utils import get_article_data
+
+DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../dataSources/PubMed/data"))
+DOC_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "./doc"))
+
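+# Aggregation granularities for the token-count statistics.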
+INTERVALS = [
+    "day",
+    "week",
+    "month"
+]
+
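+# Hugging Face tokenizers to measure; add model names here to compare others.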
+TOKENIZERS = [
+    "bert-base-uncased"
+]
+
+def get_date_indices(date, start_date):
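+    """Return the (day, week, month) bucket indices of date relative to start_date."""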
+    day_index = (date - start_date).days
+    week_index = day_index // 7
+    month_index = (date.year - start_date.year) * 12 + (date.month - start_date.month)
+    return day_index, week_index, month_index
+
+
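+# Load the PubMed articles collected over the last three years.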
+file_path = f"{DATA_DIR}/save_3_years.json"
+with open(file_path, "r", encoding="utf-8") as file:
+    data = json.load(file)
+
+print(f"Loaded {len(data)} articles")
+
+counts = {}
+
+for tokenizer_name in TOKENIZERS:
+    counts[tokenizer_name] = {
+        "day": {},
+        "week": {},
+        "month": {},
+        "ALL": 0
+    }
+
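+# Pre-seed every day/week/month bucket in the range with 0 so intervals
+# without articles still count toward the min and mean. Assumes every
+# article date falls within [start_date, end_date).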
+start_date = datetime(2022, 1, 1)
+end_date = datetime(2025, 1, 30)
+current_date = start_date
+
+while current_date < end_date:
+    day_index, week_index, month_index = get_date_indices(current_date, start_date)
+
+    for tokenizer_name in TOKENIZERS:
+        counts[tokenizer_name]["day"][day_index] = 0
+        counts[tokenizer_name]["week"][week_index] = 0
+        counts[tokenizer_name]["month"][month_index] = 0
+
+    current_date += timedelta(days=1)
+
+
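+# Tokenize each article (title and abstract concatenated as-is) and
+# accumulate the token counts into the matching buckets.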
+for tokenizer_name in TOKENIZERS:
+
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+
+    for i, article in enumerate(data, start=1):
+        print(f"Article N°{i}")
+
+        article_date = datetime(int(article["Date"]["Year"]), int(article["Date"]["Month"]), int(article["Date"]["Day"]))
+        title, abstract = get_article_data(article)
+
+        tokens = tokenizer(title+abstract, return_tensors="pt")
+        num_tokens = len(tokens["input_ids"][0])
+
+        day_index, week_index, month_index = get_date_indices(article_date, start_date)
+
+        counts[tokenizer_name]["day"][day_index] += num_tokens
+        counts[tokenizer_name]["week"][week_index] += num_tokens
+        counts[tokenizer_name]["month"][month_index] += num_tokens
+        counts[tokenizer_name]["ALL"] += num_tokens
+
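+    # Collapse each interval's per-bucket totals into min/max/mean statistics.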
+    for interval in INTERVALS:
+        counts[tokenizer_name][interval] = list(counts[tokenizer_name][interval].values())
+
+        counts[tokenizer_name][interval] = {
+            "min": min(counts[tokenizer_name][interval]),
+            "max": max(counts[tokenizer_name][interval]),
+            "mean": statistics.mean(counts[tokenizer_name][interval])
+        }
+
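+# Persist the statistics used to estimate LLM API pricing.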
+with open(f"{DOC_DIR}/token_count.json", "w") as json_file:
+    json.dump(counts, json_file, indent=4)
-- 
GitLab