Author: AI研習(xí)社-譯站 | 2020-08-26 14:34
Bilingual translation by the subtitle group: Keras tutorial: Text extraction with BERT
English original: BERT (from HuggingFace Transformers) for Text Extraction
This demonstration uses SQuAD (Stanford Question-Answering Dataset). In SQuAD, the input consists of a question and a context paragraph, and the goal is to find the span of the paragraph that answers the question. We evaluate our performance on this data with the Exact Match metric, which measures the percentage of predictions that exactly match any one of the ground-truth answers.
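To make the metric concrete, here is a toy illustration (my own example, not from the article; it uses a simpler normalization than the one defined later in the tutorial):

# Toy Exact Match check (illustrative only; assumes a simple lower-case/whitespace normalization).
def normalize(s):
    return " ".join(s.lower().split())

def exact_match(prediction, ground_truths):
    # True if the normalized prediction equals any normalized reference answer.
    return any(normalize(prediction) == normalize(ans) for ans in ground_truths)

print(exact_match("Denver Broncos", ["Denver Broncos", "The Denver Broncos"]))  # True
print(exact_match("Broncos", ["Denver Broncos", "The Denver Broncos"]))         # False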
We fine-tune a BERT model as follows:
1. Feed the context and the question as inputs to BERT.
2. Take two vectors S and T with dimensions equal to that of the hidden states in BERT.
3. Compute the probability of each token being the start and the end of the answer span. The probability of a token being the start of the answer is given by the dot product between S and the representation of that token in the last layer of BERT, followed by a softmax over all tokens. The probability of a token being the end of the answer is computed analogously with the vector T (see the sketch after this list).
4. Fine-tune BERT, learning S and T along the way.
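As a minimal sketch of the scoring described above (my own NumPy illustration assuming BERT-base shapes; it is not the article's code, which implements the same idea with Dense layers further below):

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

seq_len, hidden_dim = 384, 768                 # assumed BERT-base shapes
H = np.random.randn(seq_len, hidden_dim)       # stand-in for last-layer token representations
S = np.random.randn(hidden_dim)                # learned start vector
T = np.random.randn(hidden_dim)                # learned end vector

start_probs = softmax(H @ S)  # probability of each token being the answer start
end_probs = softmax(H @ T)    # probability of each token being the answer end
print(start_probs.shape, end_probs.shape)      # (384,) (384,)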
References:
Setup:
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

max_len = 384
configuration = BertConfig()  # default parameters and configuration for BERT
# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)
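As a quick sanity check (my own example sentence, not from the article), the fast tokenizer returns character-level offsets for every token, which is exactly what the preprocessing code below relies on:

enc = tokenizer.encode("BERT is great for question answering.")  # assumes the tokenizer created above
print(enc.tokens)   # word-piece tokens, including [CLS] and [SEP]
print(enc.ids)      # vocabulary ids that are fed to the model
print(enc.offsets)  # (start_char, end_char) span of each token in the input string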
train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)
Preprocess the data:
1. Go through the JSON file and store every record as a SquadExample object (the JSON layout is sketched below).
2. Go through each SquadExample object to create x_train, y_train, x_eval, y_eval.
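For reference, the nesting that the code below walks looks roughly like this (an abridged, made-up record of my own; the real files also contain fields such as "title" and "id"):

# Illustrative excerpt of the SQuAD v1.1 JSON layout (values are made up).
raw_data_example = {
    "data": [
        {
            "paragraphs": [
                {
                    "context": "The Eiffel Tower is located in Paris.",
                    "qas": [
                        {
                            "question": "Where is the Eiffel Tower located?",
                            "answers": [{"text": "Paris", "answer_start": 31}],
                        }
                    ],
                }
            ]
        }
    ]
}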
class SquadExample:
    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        context = self.context
        question = self.question
        answer_text = self.answer_text
        start_char_idx = self.start_char_idx

        # Clean context, answer and question
        context = " ".join(str(context).split())
        question = " ".join(str(question).split())
        answer = " ".join(str(answer_text).split())

        # Find end character index of answer in context
        end_char_idx = start_char_idx + len(answer)
        if end_char_idx >= len(context):
            self.skip = True
            return

        # Mark the character indexes in context that are in answer
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # Tokenize context
        tokenized_context = tokenizer.encode(context)

        # Find tokens that were created from answer characters
        ans_token_idx = []
        for idx, (start, end) in enumerate(tokenized_context.offsets):
            if sum(is_char_in_ans[start:end]) > 0:
                ans_token_idx.append(idx)

        if len(ans_token_idx) == 0:
            self.skip = True
            return

        # Find start and end token index for tokens from answer
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize question
        tokenized_question = tokenizer.encode(question)

        # Create inputs
        input_ids = tokenized_context.ids + tokenized_question.ids[1:]
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(
            tokenized_question.ids[1:]
        )
        attention_mask = [1] * len(input_ids)

        # Pad and create attention masks.
        # Skip if truncation is needed
        padding_length = max_len - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        self.context_token_to_char = tokenized_context.offsets


with open(train_path) as f:
    raw_train_data = json.load(f)

with open(eval_path) as f:
    raw_eval_data = json.load(f)


def create_squad_examples(raw_data):
    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                question = qa["question"]
                answer_text = qa["answers"][0]["text"]
                all_answers = [_["text"] for _ in qa["answers"]]
                start_char_idx = qa["answers"][0]["answer_start"]
                squad_eg = SquadExample(
                    question, context, start_char_idx, answer_text, all_answers
                )
                squad_eg.preprocess()
                squad_examples.append(squad_eg)
    return squad_examples


def create_inputs_targets(squad_examples):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    for item in squad_examples:
        if item.skip == False:
            for key in dataset_dict:
                dataset_dict[key].append(getattr(item, key))
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])
    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]
    return x, y


train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train = create_inputs_targets(train_squad_examples)
print(f"{len(train_squad_examples)} training points created.")

eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")
87599 training points created.
10570 evaluation points created.
Create the question-answering model using BERT and the Keras Functional API:
def create_model():
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(lr=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model
This code is best run on a Google Colab TPU runtime. With Colab TPUs, each epoch takes roughly 5-6 minutes.
use_tpu = True
if use_tpu:
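    # The remainder of this block is truncated in the source. What follows is a minimal
    # sketch of the usual Colab TPU setup (assumed; standard tf.distribute workflow),
    # ending with the model creation that the training step below requires.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()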
This callback computes the exact-match score on the validation data after every epoch.
def normalize_text(text):
    text = text.lower()

    # Remove punctuations
    exclude = set(string.punctuation)
    text = "".join(ch for ch in text if ch not in exclude)

    # Remove articles
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    text = re.sub(regex, " ", text)

    # Remove extra white space
    text = " ".join(text.split())
    return text


class ExactMatch(keras.callbacks.Callback):
    """
    Each `SquadExample` object contains the character level offsets for each token
    in its input paragraph. We use them to get back the span of text corresponding
    to the tokens between our predicted start and end tokens.
    All the ground-truth answers are also present in each `SquadExample` object.
    We calculate the percentage of data points where the span of text obtained
    from model predictions matches one of the ground-truth answers.
    """

    def __init__(self, x_eval, y_eval):
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        eval_examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False]
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            squad_eg = eval_examples_no_skip[idx]
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                pred_ans = squad_eg.context[pred_char_start:]

            normalized_pred_ans = normalize_text(pred_ans)
            normalized_true_ans = [normalize_text(_) for _ in squad_eg.all_answers]
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        acc = count / len(self.y_eval[0])
        print(f"\nepoch={epoch+1}, exact match score={acc:.2f}")
exact_match_callback = ExactMatch(x_eval, y_eval)
model.fit(
    x_train,
    y_train,
    epochs=1,  # For demonstration, 3 epochs are recommended
    verbose=2,
    batch_size=64,
    callbacks=[exact_match_callback],
)
epoch=1, exact match score=0.78
1346/1346 - 350s - activation_7_loss: 1.3488 - loss: 2.5905 - activation_8_loss: 1.2417
<tensorflow.python.keras.callbacks.History at 0x7fc78b4458d0>
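As a follow-up (my own sketch, not part of the original tutorial), the fine-tuned model can be queried on a new question/context pair by rebuilding the inputs the same way as in preprocessing and mapping the predicted token indices back to a character span:

# Hypothetical inference helper (assumes `model`, `tokenizer`, `np` and `max_len` from above).
def answer_question(question, context):
    tokenized_context = tokenizer.encode(context)
    tokenized_question = tokenizer.encode(question)

    # Same input layout as in training: context tokens first, then question tokens.
    input_ids = tokenized_context.ids + tokenized_question.ids[1:]
    token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(tokenized_question.ids[1:])
    attention_mask = [1] * len(input_ids)

    padding_length = max_len - len(input_ids)
    if padding_length < 0:
        raise ValueError("Input is longer than max_len; shorten the context.")
    input_ids += [0] * padding_length
    attention_mask += [0] * padding_length
    token_type_ids += [0] * padding_length

    inputs = [np.array([x]) for x in (input_ids, token_type_ids, attention_mask)]
    start_probs, end_probs = model.predict(inputs)
    start = int(np.argmax(start_probs[0]))
    end = int(np.argmax(end_probs[0]))

    # Map token indices back to a character span using the context offsets.
    offsets = tokenized_context.offsets
    if start >= len(offsets):
        return ""  # the model pointed outside the context
    char_start = offsets[start][0]
    char_end = offsets[end][1] if end < len(offsets) else len(context)
    return context[char_start:char_end]

print(answer_question("Where is the Eiffel Tower located?",
                      "The Eiffel Tower is located in Paris."))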
The Leiphone subtitle group (雷鋒字幕組) is a translation team made up of AI enthusiasts. Drawing on the efforts of more than five hundred volunteers, it shares the latest overseas AI news and exchanges views on industry change and technical innovation in artificial intelligence.
Team members include big-data experts, algorithm engineers, image-processing engineers, product managers, product operations staff, IT consultants, and university teachers and students. The volunteers come from well-known companies such as IBM, AVL, Adobe, Alibaba, and Baidu, and from universities and research institutes at home and abroad, including Peking University, Tsinghua University, the University of Hong Kong, the Chinese Academy of Sciences, the University of South Carolina, and Waseda University.
If you are also an AI enthusiast who loves to share, you are welcome to join the Leiphone subtitle group to learn new things and grow together.
This is a copyrighted Leiphone article; reproduction without authorization is prohibited. See the reprint guidelines for details.