Skip to content
Snippets Groups Projects
Commit e4a6a294 authored by Ivan Pavlovich's avatar Ivan Pavlovich
Browse files

Test with ollama and tokenizer

parent 4b60c205
Branches
No related tags found
No related merge requests found
......@@ -14,10 +14,12 @@ NCDS = [
]
def classify(model, sequence, labels):
prompt = f'I need you to give me the labels that could be given to the text (keep in mind that u can put multiple labels and select only the labels that i give you):\
text: {sequence}\
labels: {labels}\
Give the response in json format "labels": [] with no text at all'
prompt = 'I need you to give me the labels that could be given to the text (keep in mind that u can put multiple labels and select only the labels that i give you):\n'
prompt += 'text: ' + sequence + '\n'
prompt += 'labels: ' + "[" + ", ".join(labels) + ']\n'
prompt += 'Give the response in json format: { "labels": [] }. Follow this writing to the letter and don t add text around it. Only put the labels that you select between [].'
prompt += 'Even if you have no labels give me a response like: { "labels": [] }. And also put labels between \'\'.'
response: ChatResponse = chat(model=model, messages=[
{
......@@ -26,11 +28,15 @@ def classify(model, sequence, labels):
},
])
print(response.message.content)
json_str = response.message.content.strip().lstrip('```json').strip()
if json_str.endswith("```"):
json_str = json_str[0:-3]
json_str = json_str.split("}")[0] + "}"
responce_json = json.loads(json_str)
print(responce_json)
......@@ -40,8 +46,4 @@ def classify(model, sequence, labels):
for label in labels:
res[label] = label in responce_json["labels"]
return
text = "Theranostic drugs represent an emerging path to deliver on the promise of precision medicine. However, bottlenecks remain in characterizing theranostic targets, identifying theranostic lead compounds, and tailoring theranostic drugs. To overcome these bottlenecks, we present the Theranostic Genome, the part of the human genome whose expression can be utilized to combine therapeutic and diagnostic applications. Using a deep learning-based hybrid human-AI pipeline that cross-references PubMed, the Gene Expression Omnibus, DisGeNET, The Cancer Genome Atlas and the NIH Molecular Imaging and Contrast Agent Database, we bridge individual genes in human cancers with respective theranostic compounds. Cross-referencing the Theranostic Genome with RNAseq data from over 17'000 human tissues identifies theranostic targets and lead compounds for various human cancers, and allows tailoring targeted theranostics to relevant cancer subpopulations. We expect the Theranostic Genome to facilitate the development of new targeted theranostics to better diagnose, understand, treat, and monitor a variety of human cancers."
classify('llama3.2', text, NCDS)
\ No newline at end of file
return res
\ No newline at end of file
from transformers import AutoTokenizer
import json
import sys
import os
import statistics
from datetime import datetime, timedelta
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
from testModel.utils import get_article_data
# Token-counting script: loads a JSON file of saved PubMed data and tokenizes
# text with a Hugging Face tokenizer to report how many tokens it produces.
# NOTE(review): this span comes from a flattened diff view -- indentation is
# lost and pre-change/post-change lines appear side by side. The lines that
# tokenize the fixed `text` sample look like the old version and the
# `title+abstract` lines like their replacement; confirm against the repo.

# Data directory, resolved relative to this file's own location.
DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../dataSources/PubMed/data"))
file_path = f"{DATA_DIR}/save_3_years.json"
# Read the whole JSON document into memory; it is iterated item by item below.
with open(file_path, "r", encoding="utf-8") as file:
data = json.load(file)
# Choose a tokenizer (e.g., GPT-2, BERT, T5, etc.)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Your text
text = "Hugging Face makes NLP easy!"
# Each item of `data` is handed to the project helper get_article_data, which
# yields a (title, abstract) pair.
for article in data:
title, abstract = get_article_data(article)
# Tokenize and count tokens
tokens = tokenizer(text, return_tensors="pt") # You can also use return_tensors="tf" or "np"
# Length of the first (only) encoded sequence.
# NOTE(review): presumably includes the tokenizer's special tokens -- confirm.
num_tokens = len(tokens["input_ids"][0])
# Title and abstract are concatenated with no separator between them.
tokens = tokenizer(title+abstract, return_tensors="pt")
num_tokens = len(tokens["input_ids"][0])
print(f"Number of tokens: {num_tokens}")
\ No newline at end of file
print(f"Number of tokens: {num_tokens}")
import argparse
import os
import sys
# Deployment script: copies the project's code directories to the "anthoine:"
# rclone remote, one `rclone copy` invocation per directory.
#
# Every copy is attempted even if an earlier one fails. The success message is
# printed only when all copies succeeded; otherwise the failed commands are
# listed on stderr and the script exits with status 1.
import shlex
import subprocess

# Project root: the parent of the directory that contains this script.
PROJECT_PWD = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
# Destination directory on the remote side.
DEST_PATH = "/home/guest/Documents/NCD-Project"

# Commands are argument lists (not shell strings) so that a project path
# containing spaces or shell metacharacters is passed to rclone intact.
commands = [
    ["rclone", "copy", f"{PROJECT_PWD}/models", f"anthoine:{DEST_PATH}/models", "-P"],
    ["rclone", "copy", f"{PROJECT_PWD}/testModel", f"anthoine:{DEST_PATH}/testModel", "-P"],
    ["rclone", "copy", f"{PROJECT_PWD}/parsers", f"anthoine:{DEST_PATH}/parsers", "-P"],
    ["rclone", "copy", f"{PROJECT_PWD}/variables", f"anthoine:{DEST_PATH}/variables", "-P"],
]

failed_commands = []
for cmd in commands:
    printable_cmd = " ".join(shlex.quote(part) for part in cmd)
    print(f"-> Running: {printable_cmd}")
    try:
        return_code = subprocess.run(cmd, check=False).returncode
    except OSError as exc:
        # Raised when the rclone executable is missing or cannot be started.
        print(f"Error: could not start rclone: {exc}", file=sys.stderr)
        return_code = 1
    if return_code != 0:
        failed_commands.append(printable_cmd)

if failed_commands:
    print("Error: the following copies failed:", file=sys.stderr)
    for printable_cmd in failed_commands:
        print(f"   {printable_cmd}", file=sys.stderr)
    sys.exit(1)

print(f"Files successfully copied to anthoine:{DEST_PATH}")
\ No newline at end of file
......@@ -8,7 +8,7 @@ import argparse
# Add the parent directory to the module search path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
from variables.diseases import DISEASES_LABELS
from variables.pubmed import NCDS
from variables.huggingface import HUGGINGFACE_MODELS
from variables.articles import LENGTH_CATEGORIES, LENGTH_CATEGORIES_TRESHOLDS
from variables.models import MODELS
......@@ -80,10 +80,10 @@ length_matrix = {}
for length_category in LENGTH_CATEGORIES:
length_matrix[length_category] = data["results"][length_category]["confusion matrix"]
for disease_label in DISEASES_LABELS:
for ncd in NCDS:
try:
filename = get_dataset_filename(disease_label)
filename = get_dataset_filename(ncd)
articles = parseJsonFile(f"{DATASET_DIR}/{filename}.json")
except Exception as e:
print(f"Error: {e}")
......@@ -101,14 +101,14 @@ for disease_label in DISEASES_LABELS:
print("---------------------------------")
title, abstract = get_article_data(article)
wanted = get_wanted_predictions(article, DISEASES_LABELS)
wanted = get_wanted_predictions(article, NCDS)
start = time.time()
if MODELS[model]["isHuggingFace"]:
predictions = MODELS[model]["predict"](pipline, title+abstract, DISEASES_LABELS, data["treshold"])
predictions = MODELS[model]["predict"](pipline, title+abstract, NCDS, data["treshold"])
else:
predictions = MODELS[model]["predict"](model, title+abstract, DISEASES_LABELS)
predictions = MODELS[model]["predict"](model, title+abstract, NCDS)
end = time.time()
......
......@@ -5,7 +5,7 @@ import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
from models.ZeroShotClassifier.HuggingFace.zero_shot_classification import create_classifier, classify
import models.LLM.Ollama as ollama
import models.LLM.Ollama.ollama_wrapper as ollama
MODELS = {
'facebook/bart-large-mnli': {'predict': classify, 'isHuggingFace': True, 'pipline': create_classifier},
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment