Pretrained models with transformers: a comprehensive summary (Part 1)
This is a study summary I put together after going through the original documentation of the transformers library.
The first part covers how to load a local model, use it, modify it, and save it.
Later I will also post how to train on a custom dataset and how to fine-tune a model; with that, you should have a solid working grasp of the library.
# Notes on loading local models
* 1. When loading a pretrained model with the transformers library, 99% of the time is spent downloading the model.
For this reason, I downloaded the models directly from the Tsinghua University mirror ("mirrors.tuna.tsinghua.edu/hugging-face-models/") and placed them under my local directory "H:\\code\\Model\\" (this path can be changed).
* 2. The downloaded files are usually named "model name-" + "config.json", e.g. bert-base-cased-finetuned-mrpc-config.json. However, when loading a local model with the transformers library, the model directory must contain files named config.json, pytorch_model.bin, tf_model.h5, tokenizer.json, and so on. So before loading, the model-name prefix has to be stripped from each file name, otherwise loading will fail.
The processing code I wrote for this is as follows:
```python
# coding=utf-8
import os
import os.path

# Path where the model is stored (the directory to walk)
rootdir = r"H:\code\Model\bert-large-uncased-whole-word-masking-finetuned-squad"

for parent, dirnames, filenames in os.walk(rootdir):  # returns: 1. parent directory 2. all folder names (without path) 3. all file names
    for filename in filenames:  # file name
        print(filename)
        # Strip the model-name prefix from the file name
        newName = filename.replace('bert-large-uncased-whole-word-masking-finetuned-squad-', '')
        os.rename(os.path.join(parent, filename), os.path.join(parent, newName))  # rename
```
Once this is done, the model can be loaded with the transformers library.
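Before moving on, it can help to confirm the rename worked. A minimal sketch (using the same directory as above) simply lists the folder and checks for one of the expected file names:

```python
import os

rootdir = r"H:\code\Model\bert-large-uncased-whole-word-masking-finetuned-squad"

# After renaming, the directory should contain plain file names such as
# config.json plus pytorch_model.bin and/or tf_model.h5, depending on what was downloaded.
files = os.listdir(rootdir)
print(files)
print("config.json found:", "config.json" in files)
```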
# Model usage
Sequence classification (sentiment classification as an example)
1. Using a pipeline
```python
model_path = "H:\\code\\Model\\bert-base-cased-finetuned-mrpc\\"

from transformers import pipeline

# Use this local model with the TensorFlow framework; the default is the PyTorch framework
nlp = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path, framework="tf")
result = nlp("I hate you")[0]
print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
result = nlp("I love you")[0]
print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
```
2. Using the model directly
```python
model_path = "H:\\code\\Model\\bert-base-cased-finetuned-mrpc\\"

# PyTorch framework
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
classes = ["not paraphrase", "is paraphrase"]
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")
paraphrase_classification_logits = model(**paraphrase).logits
not_paraphrase_classification_logits = model(**not_paraphrase).logits
paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
# Should be paraphrase
for i in range(len(classes)):
    print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")
# Should not be paraphrase
for i in range(len(classes)):
    print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")
```
```python
# TensorFlow framework
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = TFAutoModelForSequenceClassification.from_pretrained(model_path)
classes = ["not paraphrase", "is paraphrase"]
sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf")
not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf")
paraphrase_classification_logits = model(paraphrase)[0]
not_paraphrase_classification_logits = model(not_paraphrase)[0]
paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
# Should be paraphrase
for i in range(len(classes)):
    print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")
# Should not be paraphrase
for i in range(len(classes)):
    print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")
```
Extractive question answering
1. Using a pipeline
```python
model_path = "H:\\code\\Model\\bert-large-uncased-whole-word-masking-finetuned-squad\\"

from transformers import pipeline

nlp = pipeline("question-answering", model=model_path, tokenizer=model_path)
context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the examples/question-answering/run_squad.py script.
"""
result = nlp(question="What is extractive question answering?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
result = nlp(question="What is a good example of a question answering dataset?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
```
2. Using the model directly
```python
model_path = "H:\\code\\Model\\bert-large-uncased-whole-word-masking-finetuned-squad\\"

# PyTorch framework
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
text = r"""
Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""
questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]
for question in questions:
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]
    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits
    answer_start = torch.argmax(answer_start_scores)  # Get the most likely beginning of the answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of the answer with the argmax of the score
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    print(f"Question: {question}")
    print(f"Answer: {answer}")
```
```python
# TensorFlow framework
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
import tensorflow as tf

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = TFAutoModelForQuestionAnswering.from_pretrained(model_path)
text = r"""
Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""
questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]
for question in questions:
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf")
    input_ids = inputs["input_ids"].numpy()[0]
    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    outputs = model(inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits
    answer_start = tf.argmax(
```