基于T5的模型微调以及对应的数据介绍
基于T5模型实现多任务训练
·
在显存较小的GPU上微调T5时,只能逐条数据训练(batch_size=1),训练代码如下:
# 使用
import datetime
import json
import os
import transformers
from torch.utils.tensorboard import SummaryWriter
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
def preprocess(text: str) -> str:
    """Return ``text`` with newline and tab characters escaped.

    Each real newline becomes the two-character sequence backslash + ``n`` and
    each real tab becomes backslash + ``t``; all other characters are kept.
    """
    return text.replace("\n", "\\n").replace("\t", "\\t")
def postprocess(text: str) -> str:
    """Return ``text`` with escaped newlines and tabs turned back into real ones.

    The two-character sequences backslash + ``n`` and backslash + ``t`` are
    replaced by an actual newline and tab respectively.
    """
    return text.replace("\\n", "\n").replace("\\t", "\t")
def _save_model(model, directory):
    """Save ``model`` into ``directory`` via ``save_pretrained``, creating the directory first."""
    os.makedirs(directory, exist_ok=True)
    # A wrapped model (one exposing ``.module``) is unwrapped before saving.
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(directory)


def train():
    """Fine-tune ClueAI/ChatYuan-large-v1 one sample at a time.

    Reads ``data/rewrite_train.xlsx`` (rows are accessed through their
    ``input`` and ``label`` fields), trains for a fixed number of epochs with
    AdamW and a linear warmup schedule, logs the loss to TensorBoard under
    ``t5/summary`` and writes checkpoints below ``t5/my_model/``:
    one every ``save_step`` samples, one per epoch, and ``final_model`` at the
    end. All hyper-parameters are local constants of this function.
    """
    import random

    import pandas as pd

    lr = 1.5e-4
    num_warmup_steps = 2000
    epochs = 3
    output_dir = "t5/my_model/"
    batch_size = 1
    gradient_accumulation = 1
    max_grad_norm = 1
    log_step = 1
    # NOTE(review): a full checkpoint every 10 samples fills the disk quickly
    # for a large model; raise this value for real training runs.
    save_step = 10

    colum_data = pd.read_excel("data/rewrite_train.xlsx")
    data_json_list = json.loads(colum_data.to_json(force_ascii=False, orient="records"))

    print('calculating total steps')
    # Number of optimizer updates over the whole run. The schedule must span
    # every epoch, so the sample count is multiplied (not divided) by `epochs`.
    total_steps = len(data_json_list) * epochs // (batch_size * gradient_accumulation)

    os.makedirs(output_dir, exist_ok=True)
    tb_writer = SummaryWriter(log_dir="t5/summary")

    tokenizer = T5Tokenizer.from_pretrained("ClueAI/ChatYuan-large-v1")
    model = T5ForConditionalGeneration.from_pretrained("ClueAI/ChatYuan-large-v1")
    model.train()
    # Prefer the GPU but fall back to the CPU instead of crashing without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are pinned to the values the transformers class defaulted to.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_steps,
    )

    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.datetime.now()
        print('time: {}'.format(now))
        random.shuffle(data_json_list)
        for step, each in enumerate(data_json_list):
            input_ids = tokenizer(preprocess(each.get("input")), return_tensors="pt").input_ids.long().to(device)
            labels = tokenizer(preprocess(each.get("label")), return_tensors="pt").input_ids.long().to(device)
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            if gradient_accumulation > 1:
                # Scale so the accumulated gradient is the mean over micro-steps.
                loss = loss / gradient_accumulation
            loss.backward()
            running_loss += loss.item()

            # optimizer step
            if (step + 1) % gradient_accumulation == 0:
                # Clip once, on the fully accumulated gradient.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                overall_step += 1
                if overall_step % log_step == 0:
                    tb_writer.add_scalar('loss', loss.item(), overall_step)
                    current_time = datetime.datetime.now()
                    print('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                        current_time.hour,
                        current_time.minute,
                        step + 1,
                        epoch + 1,
                        running_loss / log_step))
                    running_loss = 0

            if step % save_step == 0 and step >= save_step:
                print('saving model for epoch {}, step {}'.format(epoch + 1, step))
                _save_model(model, output_dir + 'model_epoch{}_step{}'.format(epoch + 1, step))

        print('saving model for epoch {}'.format(epoch + 1))
        _save_model(model, output_dir + 'model_epoch{}'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    _save_model(model, output_dir + 'final_model')
    # Flush pending TensorBoard events to disk.
    tb_writer.close()
# Only start training when this file is executed as a script, so importing it
# (e.g. to reuse preprocess/postprocess) does not kick off a training run.
if __name__ == "__main__":
    print("begin train now")
    train()
    print("train end")
如果显存充足,可以使用按批次(batch)训练的方法:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/2/27 16:39
# reference:https://github.com/Shivanandroy/T5-Finetuning-PyTorch
#数据下载:链接:https://pan.baidu.com/s/1cwKLNZD7-rsdETogacP2jw?pwd=mefc 提取码:mefc
# @Author : sparkle_code_guy
import os
from torch.utils.tensorboard import SummaryWriter
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader
from torch import cuda
import numpy as np
import pandas as pd
# Target device for the model and batches: the GPU when CUDA is usable,
# otherwise the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
class YourDataSetClass(Dataset):
    """Dataset that tokenizes one (source, target) text pair per item.

    Each item is read from two columns of a dataframe, whitespace-normalized,
    and encoded to fixed-length tensors so a DataLoader can batch them.
    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """Initialize the dataset.

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.rewrite_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """Return the number of rows in the target column."""
        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenize one string, padded/truncated to exactly ``max_length`` tokens."""
        return self.tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return the encoded source and target of the row at position ``index``.

        Returns:
            dict: ``source_ids``, ``source_mask``, ``target_ids``,
            ``target_mask`` and ``target_ids_y`` (a duplicate of
            ``target_ids`` kept for existing callers), all ``torch.long``.
        """
        # Positional lookup: works whatever index labels the dataframe carries.
        source_text = str(self.source_text.iloc[index])
        target_text = str(self.target_text.iloc[index])

        # Force string type and collapse every whitespace run to one space.
        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())

        source = self._encode(source_text, self.source_len)
        target = self._encode(target_text, self.rewrite_len)

        # Drop only the leading batch axis added by return_tensors="pt".
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)
        target_mask = target["attention_mask"].squeeze(0)

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_mask": target_mask.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }
def train(epoch, tokenizer, model, device, loader, optimizer,summary_writer,output_dir):
    """Run one training epoch over ``loader``.

    Args:
        epoch: Zero-based epoch number; used in the TensorBoard tag and
            (as ``epoch + 1``) in checkpoint directory names.
        tokenizer: Tokenizer; supplies ``pad_token_id`` and is saved next to
            the model in every checkpoint.
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; the first element of its output is used as the loss.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches with ``source_ids``, ``source_mask`` and
            ``target_ids`` entries.
        optimizer: Optimizer stepped once per batch.
        summary_writer: Writer receiving the per-batch loss via ``add_scalar``.
        output_dir: Directory under which intermediate checkpoints are written.
    """
    save_every = 100000  # batches between intermediate checkpoints

    model.train()
    for step, data in enumerate(loader, 0):
        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)
        y = data["target_ids"].to(device, dtype=torch.long)

        # Pass the full target as labels and let the model derive the decoder
        # inputs itself (per the Transformers T5 docs it shifts the labels right
        # and prepends the decoder start token). Slicing the target by hand into
        # y[:, :-1] / y[:, 1:] meant the first target token was never predicted
        # from the start token that generation begins with.
        lm_labels = y.clone()
        # -100 marks padding positions to be skipped by the loss.
        lm_labels[lm_labels == tokenizer.pad_token_id] = -100

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            labels=lm_labels,
        )
        loss = outputs[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        summary_writer.add_scalar('epoch/loss_{}'.format(epoch), loss.item(), step)

        if step % save_every == 0 and step > 0:
            print("[Saving Model]...\n")
            # Intermediate checkpoint: model and tokenizer side by side.
            path = os.path.join(output_dir, 'model_epoch{}_step{}'.format(epoch + 1, step))
            os.makedirs(path, exist_ok=True)
            model.save_pretrained(path)
            tokenizer.save_pretrained(path)
def T5Trainer(
    dataframe, source_text, target_text, model_params, output_dir="./outputs/"
):
    """Fine-tune a T5 model on two text columns of ``dataframe``.

    Args:
        dataframe: pandas DataFrame holding the training pairs.
        source_text: Name of the input-text column.
        target_text: Name of the target-text column.
        model_params: Dict providing ``MODEL``, ``SEED``,
            ``MAX_SOURCE_TEXT_LENGTH``, ``MAX_TARGET_TEXT_LENGTH``,
            ``TRAIN_BATCH_SIZE``, ``LEARNING_RATE`` and ``TRAIN_EPOCHS``.
        output_dir: Directory that receives one ``model_epoch{N}`` checkpoint
            (model + tokenizer) per epoch.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Seed the PyTorch and NumPy random generators for reproducibility.
    torch.manual_seed(model_params["SEED"])  # pytorch random seed
    np.random.seed(model_params["SEED"])  # numpy random seed

    # logging
    print(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # Tokenizer for encoding the text.
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Load the pretrained conditional-generation model and move it to the
    # module-level device.
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    # logging
    print("[Data]: Reading data...\n")

    # Keep only the two columns that are used.
    dataframe = dataframe[[source_text, target_text]]

    # train_size = 1: the whole dataframe is used for training (shuffled with
    # the seed); no validation split is held out.
    train_size = 1
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    train_dataset = train_dataset.reset_index(drop=True)

    print(f"FULL Dataset: {dataframe.shape}")
    print(f"TRAIN Dataset: {train_dataset.shape}")

    training_set = YourDataSetClass(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Parameters for the training DataLoader.
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }
    training_loader = DataLoader(training_set, **train_params)

    # Optimizer that updates the network weights during training.
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop
    print("[Initiating Fine Tuning]...\n")

    # One writer for the whole run (instead of a fresh one per epoch), closed
    # even if training raises so buffered events reach the disk.
    summary_writer = SummaryWriter(log_dir="t5/summary_task")
    path = None
    try:
        for epoch in range(model_params["TRAIN_EPOCHS"]):
            train(epoch, tokenizer, model, device, training_loader, optimizer,summary_writer,output_dir)

            print("[Saving Model]...\n")
            # Save model and tokenizer after every epoch.
            path = os.path.join(output_dir, 'model_epoch{}'.format(epoch + 1))
            os.makedirs(path, exist_ok=True)
            model.save_pretrained(path)
            tokenizer.save_pretrained(path)
    finally:
        summary_writer.close()

    # Report the directory that was actually written last; nothing is printed
    # when TRAIN_EPOCHS is 0 and no checkpoint exists.
    if path is not None:
        print(f"""[Model] Model saved @ {path}\n""")
# Script entry point: configure the run, load the tab-separated training file
# and fine-tune on its "input" -> "label" columns.
if __name__ == '__main__':
    model_params = {
        "MODEL": "ClueAI/ChatYuan-large-v1",  # model_type: t5-base/t5-large
        "TRAIN_BATCH_SIZE": 8,  # training batch size
        "TRAIN_EPOCHS": 3,  # number of training epochs
        "LEARNING_RATE": 1e-4,  # learning rate
        "MAX_SOURCE_TEXT_LENGTH": 768,  # max length of source text
        "MAX_TARGET_TEXT_LENGTH": 512,  # max length of target text
        "SEED": 42,  # set seed for reproducibility
    }
    train_dataframe = pd.read_csv("data/new_data.txt", sep='\t')
    T5Trainer(train_dataframe, "input", "label", model_params)
训练数据集:
链接:百度网盘 请输入提取码
提取码:nrb9
关于模型的应用部分:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2023/2/27 16:39
# @Author : sparkle_code_guy
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load the tokenizer and model once at import time; the functions below use
# these module-level objects.
tokenizer = T5Tokenizer.from_pretrained("ClueAI/ChatYuan-large-v1")
model = T5ForConditionalGeneration.from_pretrained("ClueAI/ChatYuan-large-v1")

# Inference is faster on a GPU (e.g. switch the Colab runtime to GPU), but fall
# back to the CPU rather than crashing when CUDA is not available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
def preprocess(text: str) -> str:
    """Return ``text`` with newline and tab characters escaped.

    Each real newline becomes the two-character sequence backslash + ``n`` and
    each real tab becomes backslash + ``t``; all other characters are kept.
    """
    return text.replace("\n", "\\n").replace("\t", "\\t")
def postprocess(text: str) -> str:
    """Return ``text`` with escaped newlines and tabs turned back into real ones.

    The two-character sequences backslash + ``n`` and backslash + ``t`` are
    replaced by an actual newline and tab respectively.
    """
    return text.replace("\\n", "\n").replace("\\t", "\t")
def answer(text, sample=True, top_p=1, temperature=0.7):
    """Generate a reply to ``text`` with the module-level model and tokenizer.

    Args:
        text: Prompt text; newlines/tabs are escaped with ``preprocess`` first.
        sample: Whether to sample. Can be set to True for generation tasks;
            when False, ``generate`` is called with ``num_beams=1`` and
            ``length_penalty=0.6`` instead of the sampling arguments.
        top_p: Value between 0 and 1 forwarded to ``generate`` when sampling.
            NOTE(review): the original note is cut off; it appears to say that
            larger values give more diverse output - confirm.
        temperature: Sampling temperature forwarded to ``generate`` when
            sampling.

    Returns:
        str: The first decoded sequence, passed through ``postprocess``.
    """
    text = preprocess(text)
    print(len(text))
    encoding = tokenizer(
        text=[text], truncation=True, padding=True, max_length=768, return_tensors="pt"
    ).to(device)

    # Arguments shared by both decoding modes.
    common_kwargs = dict(return_dict_in_generate=True, output_scores=False, max_new_tokens=512)
    if sample:
        out = model.generate(
            **encoding,
            **common_kwargs,
            do_sample=True,
            top_p=top_p,
            temperature=temperature,
            no_repeat_ngram_size=3,
        )
    else:
        out = model.generate(**encoding, **common_kwargs, num_beams=1, length_penalty=0.6)

    out_text = tokenizer.batch_decode(out["sequences"], skip_special_tokens=True)
    return postprocess(out_text[0])
def rewrite_message(input):
    """Produce four candidate answers for ``input`` as one display string.

    ``answer_message`` is called four times; each result is prefixed with its
    zero-based candidate label and the candidates are separated by a blank
    line.
    """
    print("query message:", input)
    answer_message_list = [
        "方案{0}:".format(each) + answer_message(input) for each in range(4)
    ]
    return "\n\n".join(answer_message_list)
def answer_message(input):
    """Wrap ``input`` in the dialogue prompt format and return the model's reply.

    Newlines in ``input`` are replaced by the full stop "。" so the question
    stays on one line, then the text is placed between the user prefix and the
    assistant prefix before being passed to ``answer``.
    """
    input_format = input.replace("\n", "。")
    input_text = "用户:" + input_format + "\n小智:"
    return answer(input_text)
import gradio as gr
# Example prompts offered in the UI.
examples_list = ["example1"]

# Editable multi-line box for the question and a read-only one for the answers.
_question_box = gr.components.Textbox(
    lines=10, interactive=True, placeholder="enter your question ..."
)
_answer_box = gr.components.Textbox(lines=10, interactive=False)

# Web UI that feeds the question to rewrite_message and shows its result.
synthesis_interface = gr.Interface(
    fn=rewrite_message,
    inputs=_question_box,
    outputs=_answer_box,
    cache_examples=False,
    title="问答",
    examples_per_page=5,
    examples=examples_list,
    live=False,
)

# NOTE(review): server_name '0.0.0.0' presumably makes the app reachable from
# other machines on the network - confirm this exposure is intended.
synthesis_interface.launch(share=False, server_name='0.0.0.0', server_port=7860)
更多推荐
已为社区贡献1条内容
所有评论(0)