Commit e4a6a294 authored by Ivan Pavlovich

Test with ollama and tokenizer

parent 4b60c205
@@ -14,10 +14,12 @@ NCDS = [
 ]

 def classify(model, sequence, labels):
-    prompt = f'I need you to give me the labels that could be given to the text (keep in mind that u can put multiple labels and select only the labels that i give you):\
-    text: {sequence}\
-    labels: {labels}\
-    Give the response in json format "labels": [] with no text at all'
+    prompt = 'I need you to give me the labels that could be given to the text (keep in mind that u can put multiple labels and select only the labels that i give you):\n'
+    prompt += 'text: ' + sequence + '\n'
+    prompt += 'labels: ' + "[" + ", ".join(labels) + ']\n'
+    prompt += 'Give the response in json format: { "labels": [] }. Follow this writing to the letter and don t add text around it. Only put the labels that you select between [].'
+    prompt += 'Even if you have no labels give me a response like: { "labels": [] }. And also put labels between \'\'.'

     response: ChatResponse = chat(model=model, messages=[
         {
@@ -26,11 +28,15 @@ def classify(model, sequence, labels):
         },
     ])

+    print(response.message.content)
+
     json_str = response.message.content.strip().lstrip('```json').strip()

     if json_str.endswith("```"):
         json_str = json_str[0:-3]

+    json_str = json_str.split("}")[0] + "}"
+
     responce_json = json.loads(json_str)
     print(responce_json)
@@ -40,8 +46,4 @@ def classify(model, sequence, labels):
     for label in labels:
         res[label] = label in responce_json["labels"]

-    return
-
-text = "Theranostic drugs represent an emerging path to deliver on the promise of precision medicine. However, bottlenecks remain in characterizing theranostic targets, identifying theranostic lead compounds, and tailoring theranostic drugs. To overcome these bottlenecks, we present the Theranostic Genome, the part of the human genome whose expression can be utilized to combine therapeutic and diagnostic applications. Using a deep learning-based hybrid human-AI pipeline that cross-references PubMed, the Gene Expression Omnibus, DisGeNET, The Cancer Genome Atlas and the NIH Molecular Imaging and Contrast Agent Database, we bridge individual genes in human cancers with respective theranostic compounds. Cross-referencing the Theranostic Genome with RNAseq data from over 17'000 human tissues identifies theranostic targets and lead compounds for various human cancers, and allows tailoring targeted theranostics to relevant cancer subpopulations. We expect the Theranostic Genome to facilitate the development of new targeted theranostics to better diagnose, understand, treat, and monitor a variety of human cancers."
-
-classify('llama3.2', text, NCDS)
\ No newline at end of file
+    return res
\ No newline at end of file
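
The parsing added above relies on the model returning a single flat JSON object. For reference, that extraction step can be isolated into a small helper; this is a minimal sketch of the same idea (strip a ```json fence, cut at the first closing brace, then parse), and the helper name extract_labels_json is illustrative rather than part of the commit.

import json

def extract_labels_json(raw_reply: str) -> dict:
    """Pull a {"labels": [...]} object out of a model reply.

    Mirrors the parsing in this commit: strip a leading ```json fence,
    drop a trailing ```, keep everything up to the first '}', then parse.
    Assumes the reply holds a flat object such as {"labels": [...]}.
    """
    cleaned = raw_reply.strip().lstrip('```json').strip()
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    # Cutting at the first '}' discards any trailing chatter after the object.
    cleaned = cleaned.split("}")[0] + "}"
    return json.loads(cleaned)

# Example: extract_labels_json('```json\n{ "labels": ["Diabetes"] }\n```') -> {'labels': ['Diabetes']}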
 from transformers import AutoTokenizer
+import json
+import sys
+import os
+import statistics
+from datetime import datetime, timedelta
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
+
+from testModel.utils import get_article_data
+
+DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../dataSources/PubMed/data"))
+
+file_path = f"{DATA_DIR}/save_3_years.json"
+
+with open(file_path, "r", encoding="utf-8") as file:
+    data = json.load(file)

-# Choose a tokenizer (e.g., GPT-2, BERT, T5, etc.)
 tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

-# Your text
-text = "Hugging Face makes NLP easy!"
+for article in data:
+    title, abstract = get_article_data(article)

-# Tokenize and count tokens
-tokens = tokenizer(text, return_tensors="pt")  # You can also use return_tensors="tf" or "np"
-num_tokens = len(tokens["input_ids"][0])
+    tokens = tokenizer(title+abstract, return_tensors="pt")
+    num_tokens = len(tokens["input_ids"][0])

-print(f"Number of tokens: {num_tokens}")
+    print(f"Number of tokens: {num_tokens}")
\ No newline at end of file
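
The updated test imports statistics but the visible lines only print one count per article. If the intent is to gauge how long PubMed title+abstract inputs are for the models above, the per-article counts could be aggregated along these lines (a sketch only; summarize_token_counts is a made-up name and none of this is in the commit):

import statistics
from transformers import AutoTokenizer

def summarize_token_counts(texts, model_name="bert-base-uncased"):
    """Tokenize each text and return simple summary statistics over the counts."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    counts = [len(tokenizer(text)["input_ids"]) for text in texts]
    return {
        "mean": statistics.mean(counts),
        "median": statistics.median(counts),
        "max": max(counts),
    }

# e.g. summarize_token_counts([title + abstract for (title, abstract) in pairs])
# where pairs are the (title, abstract) tuples produced by get_article_data above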
import argparse
import os
import sys
PROJECT_PWD = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
DEST_PATH = f"/home/guest/Documents/NCD-Project"
commands = [
f"rclone copy {PROJECT_PWD}/models anthoine:{DEST_PATH}/models -P",
f"rclone copy {PROJECT_PWD}/testModel anthoine:{DEST_PATH}/testModel -P",
f"rclone copy {PROJECT_PWD}/parsers anthoine:{DEST_PATH}/parsers -P",
f"rclone copy {PROJECT_PWD}/variables anthoine:{DEST_PATH}/variables -P"
]
for cmd in commands:
print(f"-> Running: {cmd}")
os.system(cmd)
print(f"Files successfully copied to anthoine:{DEST_PATH}")
\ No newline at end of file
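
One note on the copy script above: os.system discards failures, so a broken rclone transfer would go unnoticed. A subprocess-based variant (a sketch, not part of the commit) would stop at the first failing command:

import subprocess

for cmd in commands:
    print(f"-> Running: {cmd}")
    # check=True raises CalledProcessError when rclone exits non-zero,
    # so a failed copy is not silently ignored.
    subprocess.run(cmd, shell=True, check=True)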
@@ -8,7 +8,7 @@ import argparse

 # Add the parent directory to the search path
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))

-from variables.diseases import DISEASES_LABELS
+from variables.pubmed import NCDS
 from variables.huggingface import HUGGINGFACE_MODELS
 from variables.articles import LENGTH_CATEGORIES, LENGTH_CATEGORIES_TRESHOLDS
 from variables.models import MODELS
@@ -80,10 +80,10 @@ length_matrix = {}
 for length_category in LENGTH_CATEGORIES:
     length_matrix[length_category] = data["results"][length_category]["confusion matrix"]

-for disease_label in DISEASES_LABELS:
+for ncd in NCDS:
     try:
-        filename = get_dataset_filename(disease_label)
+        filename = get_dataset_filename(ncd)
         articles = parseJsonFile(f"{DATASET_DIR}/{filename}.json")
     except Exception as e:
         print(f"Error: {e}")
@@ -101,14 +101,14 @@ for disease_label in DISEASES_LABELS:
         print("---------------------------------")

         title, abstract = get_article_data(article)
-        wanted = get_wanted_predictions(article, DISEASES_LABELS)
+        wanted = get_wanted_predictions(article, NCDS)

         start = time.time()

         if MODELS[model]["isHuggingFace"]:
-            predictions = MODELS[model]["predict"](pipline, title+abstract, DISEASES_LABELS, data["treshold"])
+            predictions = MODELS[model]["predict"](pipline, title+abstract, NCDS, data["treshold"])
         else:
-            predictions = MODELS[model]["predict"](model, title+abstract, DISEASES_LABELS)
+            predictions = MODELS[model]["predict"](model, title+abstract, NCDS)

         end = time.time()
@@ -5,7 +5,7 @@ import os
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))

 from models.ZeroShotClassifier.HuggingFace.zero_shot_classification import create_classifier, classify
-import models.LLM.Ollama as ollama
+import models.LLM.Ollama.ollama_wrapper as ollama

 MODELS = {
     'facebook/bart-large-mnli': {'predict': classify, 'isHuggingFace': True, 'pipline': create_classifier},
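
For context, the test script above calls non-HuggingFace models as predict(model, title+abstract, NCDS), which matches the classify(model, sequence, labels) signature in the ollama wrapper. The actual registration is collapsed out of this diff, so the 'llama3.2' line below is only a hypothetical illustration of how the re-pointed import could be wired into MODELS:

MODELS = {
    'facebook/bart-large-mnli': {'predict': classify, 'isHuggingFace': True, 'pipline': create_classifier},
    # Hypothetical entry: ollama.classify(model, sequence, labels) fits the
    # non-HuggingFace call path predict(model, title+abstract, NCDS).
    'llama3.2': {'predict': ollama.classify, 'isHuggingFace': False},
}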