From 90b0177bfa6abf69d9a19f44473cf4a358b4808d Mon Sep 17 00:00:00 2001
From: Ivan Pavlovich <ivan.pavlovic@hes-so.ch>
Date: Wed, 12 Mar 2025 20:23:48 +0100
Subject: [PATCH] Start fine-tuning a zero-shot model on my labels

---
 .../__pycache__/pubmedApi.cpython-313.pyc     | Bin 6743 -> 6559 bytes
 dataSources/PubMed/pubmedApi.py               |  2 -
 models/FineTuning/facebook-bart-large-mnli.py | 89 ++++++++++++++++++++
 3 files changed, 89 insertions(+), 2 deletions(-)
 create mode 100644 models/FineTuning/facebook-bart-large-mnli.py

diff --git a/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc b/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc
index 9d5fa7641db53dafd2f09601a95d99a1e374299b..ff19b58d5a7d6f2c548d646fb5d9087fe9ffacd0 100644
Binary files a/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc and b/dataSources/PubMed/__pycache__/pubmedApi.cpython-313.pyc differ
diff --git a/dataSources/PubMed/pubmedApi.py b/dataSources/PubMed/pubmedApi.py
index f694ee56e..bf08b5d9f 100644
--- a/dataSources/PubMed/pubmedApi.py
+++ b/dataSources/PubMed/pubmedApi.py
@@ -106,8 +106,6 @@ def getPubmedData(term, date_min, date_max, nb_items = -1, debug = False, store
             for part in entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]:
                 if "#text" in part:
                     data["Abstract"] += part["#text"]
-        elif not isinstance(entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"], str):
-            data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]["#text"]
         else:
             data["Abstract"] = entrie["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]
 
diff --git a/models/FineTuning/facebook-bart-large-mnli.py b/models/FineTuning/facebook-bart-large-mnli.py
new file mode 100644
index 000000000..969c5d7e1
--- /dev/null
+++ b/models/FineTuning/facebook-bart-large-mnli.py
@@ -0,0 +1,89 @@
+# https://medium.com/@lidores98/finetuning-huggingface-facebook-bart-model-2c758472e340
+# Code copied from the article above; it still needs to be understood and
+# adapted to my labels. The placeholder values below are assumptions.
+
+import random
+
+import evaluate  # datasets.load_metric was removed; the evaluate package replaces it
+import numpy as np
+import pandas as pd
+from datasets import Dataset
+from transformers import (
+    BartForSequenceClassification,
+    BartTokenizerFast,
+    EvalPrediction,
+    Trainer,
+    TrainingArguments,
+    pipeline,
+)
+
+# Placeholders (assumptions): replace with the real dataset, labels and paths.
+label_to_int = {"label_one": 0, "label_two": 1, "label_three": 2}  # example label set
+template = "This example is {}."  # default zero-shot hypothesis template
+model_directory = "./results/bart-large-mnli-finetuned"
+df = pd.read_csv("data.csv")  # expected columns: "text" and "class"
+train_portion = int(len(df) * 0.8)
+test_portion = len(df) - train_portion
+
+# Split into train and test portions
+df_train = df.head(train_portion)
+df_test = df.tail(test_portion)
+# Convert to Dataset objects
+train_ds = Dataset.from_pandas(df_train, split="train")
+test_ds = Dataset.from_pandas(df_test, split="test")
+
+tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-large-mnli")
+
+def create_input_sequence(sample):
+    text = sample["text"]
+    label = sample["class"][0]
+    contradiction_label = random.choice([x for x in label_to_int if x != label])
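+    # Each premise is paired with an entailed hypothesis built from its true
+    # label and a contradicted one built from a random wrong label; 2 and 0
+    # are the entailment and contradiction ids of facebook/bart-large-mnli.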
+    encoded_sequence = tokenizer(text * 2, [template.format(label), template.format(contradiction_label)], truncation=True, padding="max_length")
+    encoded_sequence["labels"] = [2, 0]
+    encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
+    return encoded_sequence
+
+train_dataset = train_ds.map(create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "text"])
+test_dataset = test_ds.map(create_input_sequence, batched=True, batch_size=1, remove_columns=["class", "text"])
+
+def compute_metrics(p: EvalPrediction):
+    metric_acc = evaluate.load("accuracy")
+    metric_f1 = evaluate.load("f1")
+    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
+    preds = np.argmax(preds, axis=1)
+    result = {}
+    result["accuracy"] = metric_acc.compute(predictions=preds, references=p.label_ids)["accuracy"]
+    result["f1"] = metric_f1.compute(predictions=preds, references=p.label_ids, average="macro")["f1"]
+    return result
+
+training_args = TrainingArguments(
+    output_dir=model_directory,  # Output directory
+    num_train_epochs=32,  # Total number of training epochs
+    per_device_train_batch_size=16,  # Batch size per device during training
+    per_device_eval_batch_size=64,  # Batch size for evaluation
+    warmup_steps=500,  # Number of warmup steps for the learning rate scheduler
+    weight_decay=0.01,  # Strength of weight decay
+)
+
+# Keep the 3-way NLI head (contradiction/neutral/entailment): the training
+# labels use id 2, and the zero-shot pipeline expects the MNLI label layout.
+model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels=3, ignore_mismatched_sizes=True)
+
+trainer = Trainer(
+    model=model,  # The instantiated model to be trained
+    args=training_args,  # Training arguments, defined above
+    compute_metrics=compute_metrics,  # A function to compute the metrics
+    train_dataset=train_dataset,  # Training dataset
+    eval_dataset=test_dataset,  # Evaluation dataset
+    tokenizer=tokenizer,  # The tokenizer that was used
+)
+
+trainer.train()  # Run the fine-tuning
+
+classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer, device=0)
+
+sequences = ["Example abstract to classify."]  # placeholder input (assumption)
+print(classifier(sequences, list(label_to_int), multi_label=False))
-- 
GitLab