IMDB数据集下载和探索
根据TensorFlow官方教程实现:
# -*- coding: utf-8 -*-
import tensorflow as tf
from tensorflow import keras
import numpy as np
# Print the installed TensorFlow version.
print(tf.__version__)
# Download the IMDB movie-review dataset.
imdb = keras.datasets.imdb
# num_words=10000 keeps only the 10,000 most frequent words of the training data.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
# Explore the data.
print("Training entries: {}, labels: {}".format(len(train_data), len(train_labels)))
print(train_data[0])
# Reviews have different lengths.
print(len(train_data[0]), len(train_data[1]))
# Convert the integer arrays back to words.
word_index = imdb.get_word_index()
# With its default arguments load_data() reserves 0 (padding), 1 (start of
# sequence) and 2 (unknown word) and shifts every real word index by 3
# (index_from=3). The reverse lookup has to apply the same shift, otherwise
# every integer is decoded to the wrong word.
reverse_word_index = {index + 3: word for word, index in word_index.items()}
reverse_word_index[0] = '<PAD>'
reverse_word_index[1] = '<START>'
reverse_word_index[2] = '<UNK>'
reverse_word_index[3] = '<UNUSED>'
# '?' is a fallback for any integer that has no entry in the lookup table.
content = [
    ' '.join(reverse_word_index.get(num, '?') for num in text)
    for text in train_data
]
# Turn the variable-length reviews into fixed-size integer arrays: every
# sequence is brought to 256 entries, with padding appended at the end.
_PAD_OPTIONS = dict(maxlen=256, padding='post')
train_data = keras.preprocessing.sequence.pad_sequences(train_data, **_PAD_OPTIONS)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **_PAD_OPTIONS)
print(train_data[0])
# Build the model.
vocab_size = 10000
model = keras.Sequential([
    # The Embedding layer turns positive integers (word ids) into dense
    # vectors of a fixed size (16 here).
    keras.layers.Embedding(vocab_size, 16),
    # GlobalAveragePooling1D averages over the sequence dimension, giving one
    # fixed-length vector per example.
    keras.layers.GlobalAveragePooling1D(),
    # Fully connected (dense) layer with 16 hidden units.
    keras.layers.Dense(16, activation=tf.nn.relu),
    # Single sigmoid unit for the binary output.
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])
model.summary()
# Binary classification problem, so binary_crossentropy is used as the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
# Build the data sets: the first 10000 training samples become the validation
# set, the remainder is used for fitting.
x_val, partial_x_train = train_data[:10000], train_data[10000:]
y_val, partial_y_train = train_labels[:10000], train_labels[10000:]
history = model.fit(
    partial_x_train,
    partial_y_train,
    batch_size=512,
    epochs=40,
    verbose=1,
    validation_data=(x_val, y_val),
)
# Evaluate the model on the test set.
result = model.evaluate(test_data, test_labels)
print(result)
THUCNews数据集下载和探索
根据GitHub上的实现进行复现
# -*- coding: utf-8 -*-
"""
Created on Sun May 12 16:07:05 2019
@author: pc
"""
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from collections import Counter
# Locations of the three dataset splits; read_file() maps 'train'/'val'/'test'
# to these paths.
TRAIN_PATH = 'E:/task1/cnews.train.txt'
VAL_PATH = 'E:/task1/cnews.val.txt'
TEST_PATH = 'E:/task1/cnews.test.txt'
# Size of the vocabulary built by build_vocab(), including the '<PAD>' entry.
VOCAB_SIZE = 5000
# Length passed to pad_sequences() in preocess_file().
MAX_LEN = 600
# Number of samples per batch produced by batch_iter().
BATCH_SIZE = 64
def read_file(file_name, paths=None):
    """Read one dataset split into a DataFrame.

    Every line of the file is expected to have the form ``label<TAB>text``.
    Lines with fewer than two tab-separated fields (blank lines included)
    are skipped as a whole, so labels and texts always stay aligned. Only
    the first two fields of a line are used.

    Args:
        file_name: Key of the split to read; 'train', 'val' or 'test' when
            the default mapping is used.
        paths: Optional mapping from split name to file path. When omitted,
            the module-level TRAIN_PATH / VAL_PATH / TEST_PATH are used.

    Returns:
        A pandas DataFrame with the columns 'text' and 'label', one row per
        well-formed line.

    Raises:
        KeyError: If ``file_name`` is not a key of the mapping.
    """
    if paths is None:
        paths = {'train': TRAIN_PATH, 'val': VAL_PATH, 'test': TEST_PATH}
    contents = []
    labels = []
    with open(paths[file_name], 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            # Check before appending anything: appending the label first and
            # then failing on the text would leave the two lists with
            # different lengths.
            if len(fields) < 2:
                continue
            labels.append(fields[0])
            contents.append(fields[1])
    return pd.DataFrame({'text': contents, 'label': labels})
def build_vocab(data, vocab_size=None):
    """Build a character-level vocabulary from the 'text' column.

    Args:
        data: DataFrame with a 'text' column of strings.
        vocab_size: Total number of vocabulary entries, including the
            '<PAD>' entry. Defaults to the module-level VOCAB_SIZE.

    Returns:
        A list starting with '<PAD>' followed by the ``vocab_size - 1`` most
        frequent characters, most frequent first.
    """
    if vocab_size is None:
        vocab_size = VOCAB_SIZE
    # Count characters text by text instead of first collecting every
    # character of the corpus in one big list.
    counter = Counter()
    for text in data['text']:
        counter.update(text)
    # Index 0 is reserved for the padding symbol.
    frequent_chars = [char for char, _ in counter.most_common(vocab_size - 1)]
    return ['<PAD>'] + frequent_chars
def read_vocab(words):
    """Map every vocabulary entry to its position in ``words``.

    Args:
        words: Sequence of vocabulary entries, e.g. the result of
            build_vocab().

    Returns:
        A dict ``{word: index}``. If an entry occurs more than once, the
        index of its last occurrence is kept.
    """
    return {word: index for index, word in enumerate(words)}
def read_category(data):
    """Fix the category list and convert it to a ``{category: id}`` mapping.

    Args:
        data: DataFrame with a 'label' column.

    Returns:
        A dict that assigns consecutive ids, starting at 0, to the distinct
        labels in order of first appearance.
    """
    distinct_labels = data['label'].drop_duplicates()
    return {label: index for index, label in enumerate(distinct_labels)}
def to_words(content, words):
    """Turn a sequence of ids back into a space-separated string.

    Args:
        content: Iterable of ids used to index ``words``.
        words: Vocabulary indexed by id, e.g. the list from build_vocab().

    Returns:
        The entries ``words[i]`` for every ``i`` in ``content``, joined by
        single spaces.
    """
    return ' '.join(words[i] for i in content)
def preocess_file(data, words_id, category_id, max_len=None):
    """Convert texts and labels to their id representation.

    Args:
        data: DataFrame with the columns 'text' and 'label'.
        words_id: Mapping ``{character: id}``; characters that are not in
            the mapping are dropped from the text.
        category_id: Mapping ``{label: id}``.
        max_len: Length every id sequence is padded or truncated to.
            Defaults to the module-level MAX_LEN.

    Returns:
        A tuple ``(x_pad, y_pad)``: the id sequences after pad_sequences()
        and the label ids after to_categorical() with one class per entry
        of ``category_id``.

    Raises:
        KeyError: If a label is not present in ``category_id``.
    """
    if max_len is None:
        max_len = MAX_LEN
    content_id = [
        [words_id[char] for char in text if char in words_id]
        for text in data['text']
    ]
    label_id = [category_id[label] for label in data['label']]
    # Use the pad_sequences helper provided by Keras to bring every text to a
    # fixed length.
    x_pad = keras.preprocessing.sequence.pad_sequences(content_id, max_len)
    y_pad = keras.utils.to_categorical(label_id, num_classes=len(category_id))
    return x_pad, y_pad
def batch_iter(x, y, batch_size=None):
    """Yield shuffled mini-batches for training a neural network.

    This is a generator function: calling it only creates a generator; the
    batches are produced while it is iterated, e.g.
    ``for x_batch, y_batch in batch_iter(x, y): ...``.

    Args:
        x: Samples, indexable along the first axis (array or sequence).
        y: Labels belonging to ``x``, in the same order.
        batch_size: Maximum number of samples per batch. Defaults to the
            module-level BATCH_SIZE.

    Yields:
        Tuples ``(x_batch, y_batch)``. Samples and labels are shuffled with
        the same permutation, so pairs stay together. The last batch may be
        smaller than ``batch_size``; empty input yields no batch at all.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    # Index arrays only work on numpy arrays, so plain lists are converted.
    x = np.asarray(x)
    y = np.asarray(y)
    num_samples = len(x)
    indices = np.random.permutation(num_samples)
    x_shuffle = x[indices]
    y_shuffle = y[indices]
    for start_id in range(0, num_samples, batch_size):
        end_id = start_id + batch_size  # slicing clamps at the array end
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
train = read_file('train')
# Show the distinct label categories.
print(train['label'].drop_duplicates())
words = build_vocab(train)
words_id = read_vocab(words)
category_id = read_category(train)
x_pad, y_pad = preocess_file(train, words_id, category_id)
# batch_iter() is a generator function: a bare call only creates a generator
# object and produces nothing. The batches appear when it is iterated, which
# is what a training loop would do once per epoch.
for x_batch, y_batch in batch_iter(x_pad, y_pad):
    print(x_batch.shape, y_batch.shape)
    break  # one batch is enough to show the usage
test = read_file('test')
val = read_file('val')
对于函数 batch_iter(x, y) 的使用还存在疑惑,还有待学习。