The code file is bert_lstm_ner.py; below is a line-by-line walkthrough:
tf.logging.set_verbosity(tf.logging.INFO)  # while the code runs you will see INFO-level logs such as: INFO:tensorflow:loss = 1.18812, step = 1 / loss = 0.210323, step = 101 / loss = 0.109025, step = 201
processors = {
"ner": NerProcessor
}  # a dict mapping the task name "ner" to the NerProcessor class
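As a rough sketch of the interface this dict expects (the method names below simply follow the calls made later in this file; the class body is hypothetical, not the repo's actual implementation):

class NerProcessorSketch:  # hypothetical illustration only
    def get_train_examples(self, data_dir): ...  # read the training data from data_dir, return a list of examples
    def get_dev_examples(self, data_dir): ...    # read the dev data
    def get_test_examples(self, data_dir): ...   # read the test data
    def get_labels(self):                        # the NER tag set used below
        return ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "X", "[CLS]", "[SEP]"]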
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)  # load the BERT hyperparameters into bert_config
if FLAGS.max_seq_length > bert_config.max_position_embeddings:  # if the requested maximum input sequence length exceeds the maximum position embedding length BERT was trained with, raise an error
raise ValueError(
"Cannot use sequence length %d because the BERT model "
"was only trained up to sequence length %d" %
(FLAGS.max_seq_length, bert_config.max_position_embeddings))
# Files produced by the previous round are only deleted during training; no cleaning is done when predicting
if FLAGS.clean and FLAGS.do_train:  # both default to True
if os.path.exists(FLAGS.output_dir):  # if the output directory exists
def del_file(path):  # helper that deletes a directory's contents (an idiomatic shutil-based alternative is sketched after this clean block)
ls = os.listdir(path)  # os.listdir returns the names of all entries in the directory
for i in ls:
c_path = os.path.join(path, i)  # os.path.join concatenates path components
if os.path.isdir(c_path):  # if the entry is itself a directory
del_file(c_path)  # recurse into it and delete its contents
else:
os.remove(c_path)  # otherwise delete the file
try:
del_file(FLAGS.output_dir)  # try to clear the output directory; report the error otherwise
except Exception as e:
print(e)
print('pleace remove the files of output dir and data.conf')
exit(-1)
if os.path.exists(FLAGS.data_config_path):  # if the cached data config file exists
try:
os.remove(FLAGS.data_config_path)  # try to delete it
except Exception as e:
print(e)
print('pleace remove the files of output dir and data.conf')
exit(-1)
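The recursive del_file above does the same job as the standard library's shutil.rmtree; a minimal sketch of an equivalent cleanup (not the code the repo uses):

import os
import shutil

def clean_output_dir(output_dir):  # hypothetical helper, equivalent to del_file plus recreating the directory
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)  # remove the whole directory tree
    os.makedirs(output_dir)        # recreate an empty output directory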
task_name = FLAGS.task_name.lower()  # task_name is the name of the task to train, here "ner"
if task_name not in processors:  # if processors has no entry for this task, raise an error
raise ValueError("Task not found: %s" % (task_name))
processor = processors[task_name]()  # instantiate NerProcessor
label_list = processor.get_labels()  # label_list is ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "X", "[CLS]", "[SEP]"]
tokenizer = tokenization.FullTokenizer(
vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)  # build a FullTokenizer on top of BERT's vocab file (lower-casing the input first if do_lower_case is set)
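A quick usage sketch of FullTokenizer (the vocab path and the sentence are made up; with the Chinese BERT vocab the tokenizer works at character level):

from bert import tokenization  # the import path may differ depending on the repo layout

tok = tokenization.FullTokenizer(vocab_file='vocab.txt', do_lower_case=True)
tokens = tok.tokenize("今天天气不错")      # e.g. ['今', '天', '天', '气', '不', '错']
ids = tok.convert_tokens_to_ids(tokens)    # map the tokens to vocabulary ids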
tpu_cluster_resolver = None  # no TPU cluster is used
if FLAGS.use_tpu and FLAGS.tpu_name:  # TPU path, not taken here
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2  # with PER_HOST_V1 or PER_HOST_V2, input_fn is invoked once on each host; this selects the per-host input pipeline configuration with the global batch size
run_config = tf.contrib.tpu.RunConfig(  # build the (TPU-compatible) run configuration
cluster=tpu_cluster_resolver,  # None here
master=FLAGS.master,  # None; "TensorFlow master URL."
model_dir=FLAGS.output_dir,  # output directory
save_checkpoints_steps=FLAGS.save_checkpoints_steps,  # "how often to save the model checkpoint"; 1000
tpu_config=tf.contrib.tpu.TPUConfig(  # TPU-specific settings
iterations_per_loop=FLAGS.iterations_per_loop,  # "how many steps to make in each estimator call"; 1000
num_shards=FLAGS.num_tpu_cores,  # number of TPU cores, 8
per_host_input_for_training=is_per_host))  # PER_HOST_V2
train_examples = None  # initialized to None
num_train_steps = None
num_warmup_steps = None
if os.path.exists(FLAGS.data_config_path):  # if the data config file, which caches the train/dev configuration, exists
with codecs.open(FLAGS.data_config_path) as fd:  # open it
data_config = json.load(fd)  # and load its contents into data_config
else:
data_config = {}  # otherwise start from an empty dict
if FLAGS.do_train:
# load the training data
if len(data_config) == 0:  # if the config cache is empty
train_examples = processor.get_train_examples(FLAGS.data_dir)  # read the training examples
num_train_steps = int(
len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)  # total training steps = number of examples / batch size * number of epochs
num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)  # warmup steps = num_train_steps * the proportion of training used for linear learning-rate warmup (worked out below with this run's numbers)
data_config['num_train_steps'] = num_train_steps  # cache the value in data_config
data_config['num_warmup_steps'] = num_warmup_steps  # cache this one as well
data_config['num_train_size'] = len(train_examples)  # cache the number of training examples
else:
num_train_steps = int(data_config['num_train_steps'])  # otherwise read the value straight from the cache
num_warmup_steps = int(data_config['num_warmup_steps'])  # same here
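With the numbers logged later in this run (20864 training examples, batch size 64) and assuming the defaults num_train_epochs=3 and warmup_proportion=0.1, the two step counts work out as follows:

num_train_size = 20864    # "Num examples" from the training log below
train_batch_size = 64
num_train_epochs = 3      # assumed default, consistent with the logged 978 steps
warmup_proportion = 0.1   # assumed default

num_train_steps = int(num_train_size / train_batch_size * num_train_epochs)  # 978
num_warmup_steps = int(num_train_steps * warmup_proportion)                  # 97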
# The returned model_fn is a function that defines the model, training, and evaluation; through hooks it loads the pre-trained BERT weights to initialize this model's own parameters
# This is TF's newer Estimator architecture: you define model_fn to describe the model, and the Estimator API then drives the rest of the work, i.e. training, prediction, and evaluation
model_fn = model_fn_builder(
bert_config=bert_config,  # read from the BERT config file
num_labels=len(label_list) + 1,  # number of labels
init_checkpoint=FLAGS.init_checkpoint,  # e.g. r'D:\bert\chinese_L-12_H-768_A-12\bert_model.ckpt'; "Initial checkpoint (usually from a pre-trained BERT model)."
learning_rate=FLAGS.learning_rate,  # learning rate, 5e-5
num_train_steps=num_train_steps,  # total training steps
num_warmup_steps=num_warmup_steps,  # number of warmup steps
# warmup means training with a small learning rate first (e.g. 0.01) and, after some iterations (e.g. 400), raising it to the target rate (e.g. 0.1) for the real training (a sketch of this schedule follows below)
use_tpu=FLAGS.use_tpu,  # False
use_one_hot_embeddings=FLAGS.use_tpu)  # False
print(model_fn)
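A simplified sketch of the linear warmup schedule described above; BERT's optimization.py builds this with TensorFlow ops and also applies polynomial decay after warmup, but the idea is just:

def warmup_lr(step, base_lr=5e-5, num_warmup_steps=97):  # illustrative sketch only
    if step < num_warmup_steps:
        return base_lr * step / num_warmup_steps  # ramp up linearly from 0 to base_lr
    return base_lr  # afterwards train at the target rate (BERT additionally decays it)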
estimator = tf.contrib.tpu.TPUEstimator(  # build the estimator
use_tpu=FLAGS.use_tpu,  # False
model_fn=model_fn,  # the model_fn built above
config=run_config,  # the RunConfig built above
train_batch_size=FLAGS.train_batch_size,  # training batch size, 64
eval_batch_size=FLAGS.eval_batch_size,  # evaluation batch size, 8
predict_batch_size=FLAGS.predict_batch_size)  # prediction batch size, 8
train_file =r'C:\Users\dell\Desktop\Name-Entity-Recognition-master\BERT-BiLSTM-CRF-NER\train.tf_record'
filed_based_convert_examples_to_features(
train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file)  # convert the training data to TFRecord format as model input: examples, labels, max length, tokenizer, output file
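Each record written to train.tf_record is a serialized tf.train.Example holding four int64 lists padded to max_seq_length; a sketch of one such record (the feature names follow this repo's convert function, the id values are purely illustrative):

import tensorflow as tf

def int_feature(values):  # wrap an int list as a tf.train.Feature
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

example = tf.train.Example(features=tf.train.Features(feature={
    "input_ids":   int_feature([101, 2769, 4263, 102] + [0] * 124),  # [CLS] ... [SEP] plus padding
    "input_mask":  int_feature([1, 1, 1, 1] + [0] * 124),            # 1 = real token, 0 = padding
    "segment_ids": int_feature([0] * 128),                           # single-sentence input
    "label_ids":   int_feature([9, 1, 2, 10] + [0] * 124),           # illustrative label indices
}))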
num_train_size = int(data_config['num_train_size'])
tf.logging.info("***** Running training *****")
tf.logging.info(" Num examples = %d", num_train_size)#20864
tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)#64
tf.logging.info(" Num steps = %d", num_train_steps)#978
train_input_fn = file_based_input_fn_builder(
input_file=train_file,  # the training TFRecord file
seq_length=FLAGS.max_seq_length,  # max sequence length, 128
is_training=True,  # training mode
drop_remainder=True)  # drop the final batch if it holds fewer than batch_size examples (TPUs need fixed batch shapes)
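drop_remainder decides whether the final batch is discarded when it has fewer than batch_size examples; TPUs need fixed batch shapes, which is why it is True for training. A tiny tf.data illustration:

import tensorflow as tf

ds = tf.data.Dataset.range(10).batch(3, drop_remainder=True)
# yields the batches [0 1 2], [3 4 5], [6 7 8]; the final partial batch [9] is dropped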
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)  # run training
if FLAGS.do_eval:  # run evaluation
if data_config.get('eval.tf_record_path', '') == '':  # if the config cache has no eval record path yet
eval_examples = processor.get_dev_examples(FLAGS.data_dir)  # read the dev file from data_dir
eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")  # path of the eval.tf_record file in the output directory
filed_based_convert_examples_to_features(
eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)  # convert the evaluation examples to TFRecord
data_config['eval.tf_record_path'] = eval_file  # cache the eval record path
data_config['num_eval_size'] = len(eval_examples)  # cache the number of eval examples
else:
eval_file = data_config['eval.tf_record_path']  # otherwise read the eval record path from the cache
# print the dev set information
num_eval_size = data_config.get('num_eval_size', 0)  # number of eval examples
tf.logging.info("***** Running evaluation *****")
tf.logging.info(" Num examples = %d", num_eval_size)#2318
tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size)#8
eval_steps = None
if FLAGS.use_tpu:  # False
eval_steps = int(num_eval_size / FLAGS.eval_batch_size)  # TPU-only path, not taken
eval_drop_remainder = True if FLAGS.use_tpu else False  # False
eval_input_fn = file_based_input_fn_builder(
input_file=eval_file,  # the eval TFRecord file
seq_length=FLAGS.max_seq_length,  # max sequence length
is_training=False,  # not training
drop_remainder=eval_drop_remainder)  # False
result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)  # steps=None (the author reports an error at this point)
output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")  # the evaluation results file
with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
tf.logging.info("***** Eval results *****")
for key in sorted(result.keys()):
tf.logging.info(" %s = %s", key, str(result[key]))#报出文件
writer.write("%s = %s\n" % (key, str(result[key])))#写入文件
# 保存数据的配置文件,避免在以后的训练过程中多次读取训练以及测试数据集,消耗时间
if not os.path.exists(FLAGS.data_config_path):
with codecs.open(FLAGS.data_config_path, 'a', encoding='utf-8') as fd:
json.dump(data_config, fd)  # dump data_config as JSON into data_config_path (opened in append mode)
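After a full train + eval run the cached config roughly looks like the dict below (step counts as computed earlier, the eval path is shown schematically since it depends on output_dir):

data_config = {
    "num_train_steps": 978,
    "num_warmup_steps": 97,
    "num_train_size": 20864,
    "eval.tf_record_path": "<output_dir>/eval.tf_record",
    "num_eval_size": 2318,
}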
if FLAGS.do_predict:  # run prediction
token_path = os.path.join(FLAGS.output_dir, "token_test.txt")  # path of the test-token output file
if os.path.exists(token_path):  # if it already exists
os.remove(token_path)  # delete it
with codecs.open(os.path.join(FLAGS.output_dir, 'label2id.pkl'), 'rb') as rf:  # open the label2id pickle file
label2id = pickle.load(rf)
id2label = {value: key for key, value in label2id.items()}  # invert it into an id-to-label dict
predict_examples = processor.get_test_examples(FLAGS.data_dir)  # read the test file
predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")  # path of the prediction TFRecord file
filed_based_convert_examples_to_features(predict_examples, label_list,
FLAGS.max_seq_length, tokenizer,
predict_file, mode="test")  # build the test TFRecord file
tf.logging.info("***** Running prediction*****")
tf.logging.info(" Num examples = %d", len(predict_examples))#4636
tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)#8
if FLAGS.use_tpu:
# Warning: According to tpu_estimator.py Prediction on TPU is an
# experimental feature and hence not supported here
raise ValueError("Prediction in TPU not supported")
predict_drop_remainder = True if FLAGS.use_tpu else False  # False
predict_input_fn = file_based_input_fn_builder(
input_file=predict_file,  # the input file
seq_length=FLAGS.max_seq_length,  # max sequence length
is_training=False,  # not training
drop_remainder=predict_drop_remainder)  # False
predicted_result = estimator.evaluate(input_fn=predict_input_fn)  # the author reports an error here as well
output_eval_file = os.path.join(FLAGS.output_dir, "predicted_results.txt")  # file for the prediction metrics
with codecs.open(output_eval_file, "w", encoding='utf-8') as writer:
tf.logging.info("***** Predict results *****")
for key in sorted(predicted_result.keys()):
tf.logging.info(" %s = %s", key, str(predicted_result[key]))
writer.write("%s = %s\n" % (key, str(predicted_result[key])))#写入文件
result = estimator.predict(input_fn=predict_input_fn)#预测
output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")#输出文件
def result_to_pair(writer):#这里是写入函数
for predict_line, prediction in zip(predict_examples, result):
idx = 0
line = ''
line_token = str(predict_line.text).split(' ')
label_token = str(predict_line.label).split(' ')
if len(line_token) != len(label_token):
tf.logging.info(predict_line.text)
tf.logging.info(predict_line.label)
for id in prediction:
if id == 0:
continue
curr_labels = id2label[id]
if curr_labels in ['[CLS]', '[SEP]']:
continue
# For unknown reasons an "idx out of range" error can occur here
try:
line += line_token[idx] + ' ' + label_token[idx] + ' ' + curr_labels + '\n'
except Exception as e:
tf.logging.info(e)
tf.logging.info(predict_line.text)
tf.logging.info(predict_line.label)
line = ''
break
idx += 1
writer.write(line + '\n')
with codecs.open(output_predict_file, 'w', encoding='utf-8') as writer:
result_to_pair(writer)  # write the predictions to label_test.txt
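Each line written by result_to_pair contains the token, the gold label, and the predicted label separated by spaces, with a blank line between sentences, which is exactly the format the CoNLL evaluation script expects; an illustrative fragment (tokens and labels are made up):

中 B-LOC B-LOC
国 I-LOC I-LOC
很 O O
大 O O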
from conlleval import return_report
eval_result = return_report(output_predict_file)  # return_report comes from the conlleval script bundled with the repo; it computes the standard CoNLL precision/recall/F1 report from the prediction file
print(eval_result)