Test with ollama and tokenizer

e4a6a294 · Ivan Pavlovich · 4b60c205 · e4a6a294 · e4a6a294 · e4a6a294
Commit e4a6a294 authored 1 month ago by Ivan Pavlovich
--- a/models/LLM/Ollama/ollama_wrapper.py
+++ b/models/LLM/Ollama/ollama_wrapper.py
@@ -14,10 +14,12 @@ NCDS = [
 ]

 def classify(model, sequence, labels):
-    prompt = f'I need you to give me the labels that could be given to the text (keep in mind that u can put multiple labels and select only the labels that i give you):\
-        text: {sequence}\
-        labels: {labels}\
-        Give the response in json format "labels": [] with no text at all'
+
+    prompt = 'I need you to give me the labels that could be given to the text (keep in mind that u can put multiple labels and select only the labels that i give you):\n'
+    prompt += 'text: ' + sequence + '\n'
+    prompt += 'labels: ' + "[" + ", ".join(labels) + ']\n'
+    prompt += 'Give the response in json format: { "labels": [] }. Follow this writing to the letter and don t add text around it. Only put the labels that you select between [].'
+    prompt += 'Even if you have no labels give me a response like: { "labels": [] }. And also put labels between \'\'.'

    response: ChatResponse = chat(model=model, messages=[
    {
@@ -26,11 +28,15 @@ def classify(model, sequence, labels):
    },
    ])

+    print(response.message.content)
+
    json_str = response.message.content.strip().lstrip('```json').strip()

    if json_str.endswith("```"):
        json_str = json_str[0:-3]

+    json_str = json_str.split("}")[0] + "}"
+
    responce_json = json.loads(json_str)

    print(responce_json)
@@ -40,8 +46,4 @@ def classify(model, sequence, labels):
    for label in labels:
        res[label] = label in responce_json["labels"]
            
-    return 
-
-text = "Theranostic drugs represent an emerging path to deliver on the promise of precision medicine. However, bottlenecks remain in characterizing theranostic targets, identifying theranostic lead compounds, and tailoring theranostic drugs. To overcome these bottlenecks, we present the Theranostic Genome, the part of the human genome whose expression can be utilized to combine therapeutic and diagnostic applications. Using a deep learning-based hybrid human-AI pipeline that cross-references PubMed, the Gene Expression Omnibus, DisGeNET, The Cancer Genome Atlas and the NIH Molecular Imaging and Contrast Agent Database, we bridge individual genes in human cancers with respective theranostic compounds. Cross-referencing the Theranostic Genome with RNAseq data from over 17'000 human tissues identifies theranostic targets and lead compounds for various human cancers, and allows tailoring targeted theranostics to relevant cancer subpopulations. We expect the Theranostic Genome to facilitate the development of new targeted theranostics to better diagnose, understand, treat, and monitor a variety of human cancers."
-
-classify('llama3.2', text, NCDS)
\ No newline at end of file
+    return res
\ No newline at end of file
--- a/models/LLM/Tokenizer/test.py
+++ b/models/LLM/Tokenizer/test.py
 from transformers import AutoTokenizer
+import json
+import sys
+import os
+import statistics
+from datetime import datetime, timedelta
+
+# Make the repository root (three directory levels above this file) importable,
+# so that the `testModel` package resolves when this script is run directly.
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
+
+from testModel.utils import get_article_data
+
+# Absolute path of dataSources/PubMed/data, resolved relative to this file.
+DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../dataSources/PubMed/data"))
+
+file_path = f"{DATA_DIR}/save_3_years.json"
+
+# Load the whole JSON file up front; the loop below iterates over its entries.
+# presumably a list of article records — verify against get_article_data.
+with open(file_path, "r", encoding="utf-8") as file:
+    data = json.load(file)

-# Choose a tokenizer (e.g., GPT-2, BERT, T5, etc.)
 tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

-# Your text
-text = "Hugging Face makes NLP easy!"
+# Print the bert-base-uncased token count of every article (title + abstract).
+for article in data:
+
+    title, abstract = get_article_data(article)

-# Tokenize and count tokens
-tokens = tokenizer(text, return_tensors="pt")  # You can also use return_tensors="tf" or "np"
-num_tokens = len(tokens["input_ids"][0])
+    # NOTE(review): title and abstract are concatenated with no separator, so
+    # the last word of the title and the first word of the abstract may be
+    # tokenized as one word — confirm this is intended.
+    tokens = tokenizer(title+abstract, return_tensors="pt")
+    # "input_ids" is batched; index 0 is the single sequence that was passed in.
+    num_tokens = len(tokens["input_ids"][0])

-print(f"Number of tokens: {num_tokens}")
\ No newline at end of file
+    print(f"Number of tokens: {num_tokens}")
--- a/scripts/copy_files_ollama.py
+++ b/scripts/copy_files_ollama.py
+"""Copy selected project folders to the `anthoine` rclone remote.
+
+Each folder is copied with its own `rclone copy` call. Every copy is
+attempted even if an earlier one fails; the script then exits with status 1
+and lists the failed commands instead of reporting success.
+"""
+import argparse
+import os
+import subprocess
+import sys
+
+
+# Repository root: the parent directory of the folder containing this script.
+PROJECT_PWD = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
+DEST_PATH = "/home/guest/Documents/NCD-Project"
+
+# Top-level project folders mirrored to the remote, one rclone call each.
+FOLDERS = ["models", "testModel", "parsers", "variables"]
+
+# Argument lists rather than shell strings, so that paths containing spaces
+# or shell metacharacters reach rclone unmodified.
+commands = [
+    ["rclone", "copy", f"{PROJECT_PWD}/{folder}", f"anthoine:{DEST_PATH}/{folder}", "-P"]
+    for folder in FOLDERS
+]
+
+failed = []
+
+for cmd in commands:
+    printable = " ".join(cmd)
+    print(f"-> Running: {printable}")
+    try:
+        returncode = subprocess.run(cmd).returncode
+    except OSError as e:
+        # Raised when the rclone executable cannot be started at all
+        # (for example, it is not installed or not on PATH).
+        print(f"Error: {e}", file=sys.stderr)
+        returncode = 1
+    if returncode != 0:
+        failed.append(printable)
+
+if failed:
+    print(f"{len(failed)} of {len(commands)} copy command(s) failed:", file=sys.stderr)
+    for printable in failed:
+        print(f"   {printable}", file=sys.stderr)
+    sys.exit(1)
+
+print(f"Files successfully copied to anthoine:{DEST_PATH}")
\ No newline at end of file
--- a/testModel/test.py
+++ b/testModel/test.py
@@ -8,7 +8,7 @@ import argparse
 # Ajouter le répertoire parent au chemin de recherche
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))

-from variables.diseases import DISEASES_LABELS
+from variables.pubmed import NCDS
 from variables.huggingface import HUGGINGFACE_MODELS
 from variables.articles import LENGTH_CATEGORIES, LENGTH_CATEGORIES_TRESHOLDS
 from variables.models import MODELS
@@ -80,10 +80,10 @@ length_matrix = {}
 for length_category in LENGTH_CATEGORIES:
    length_matrix[length_category] = data["results"][length_category]["confusion matrix"]

-for disease_label in DISEASES_LABELS:
+for ncd in NCDS:

    try:
-        filename = get_dataset_filename(disease_label)
+        filename = get_dataset_filename(ncd)
        articles = parseJsonFile(f"{DATASET_DIR}/{filename}.json")
    except Exception as e:
        print(f"Error: {e}")
@@ -101,14 +101,14 @@ for disease_label in DISEASES_LABELS:
        print("---------------------------------")

        title, abstract = get_article_data(article)
-        wanted = get_wanted_predictions(article, DISEASES_LABELS)
+        wanted = get_wanted_predictions(article, NCDS)

        start = time.time()

        if MODELS[model]["isHuggingFace"]:
-            predictions = MODELS[model]["predict"](pipline, title+abstract, DISEASES_LABELS, data["treshold"])
+            predictions = MODELS[model]["predict"](pipline, title+abstract, NCDS, data["treshold"])
        else:
-            predictions = MODELS[model]["predict"](model, title+abstract, DISEASES_LABELS)
+            predictions = MODELS[model]["predict"](model, title+abstract, NCDS)

        end = time.time()


--- a/variables/models.py
+++ b/variables/models.py
@@ -5,7 +5,7 @@ import os
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))

 from models.ZeroShotClassifier.HuggingFace.zero_shot_classification import create_classifier, classify
-import models.LLM.Ollama as ollama
+import models.LLM.Ollama.ollama_wrapper as ollama

 MODELS = {
    'facebook/bart-large-mnli': {'predict': classify, 'isHuggingFace': True, 'pipline': create_classifier},