读取数据
import re
import string

import pandas as pd
# Load the dataset from 'data.csv' (a path relative to the current working
# directory) into the DataFrame that the following steps operate on.
df = pd.read_csv('data.csv')
过滤非ASCII字符
# Strip every run of non-ASCII characters from the description column.
# `.str.replace` returns a new Series, so the result is assigned back to the
# column. `regex=True` is explicit because pandas >= 2.0 treats the pattern as
# a literal string by default.
df['description'] = df['description'].str.replace(r'[^\x00-\x7F]+', '', regex=True)
过滤数字
# Remove every run of digits from the description column.
# The pattern is a raw string so `\d` reaches the regex engine unchanged, the
# result is assigned back because `.str.replace` does not modify in place, and
# `regex=True` is explicit because pandas >= 2.0 defaults to literal matching.
df['description'] = df['description'].str.replace(r'\d+', '', regex=True)
去停用词
from nltk.corpus import stopwords
# English stop-word list from NLTK's `stopwords` corpus.
stop = stopwords.words('english')
# Membership tests against a set are O(1); the raw list would be scanned once
# for every word of every description.
_stop_lookup = frozenset(stop)
# Drop stop words from each description. Operates on `df`, the frame loaded
# from data.csv.
# NOTE(review): the comparison is case-sensitive, so a capitalised word such as
# "The" is kept unless the list itself contains that casing - confirm intended.
df['description'] = df['description'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in _stop_lookup)
)
从HTML中提取纯文本
# Translation table that deletes every ASCII punctuation character; built once
# at import time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_html(html):
    """
    Strip HTML markup, URLs and ASCII punctuation from the given string.

    Originally copied from the NLTK package. Processing order: inline
    ``<script>``/``<style>`` elements, HTML comments, remaining tags,
    non-breaking spaces, URLs, punctuation, and finally repeated spaces.

    :param html: the HTML string to be cleaned
    :type html: str
    :rtype: str
    """
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Turn non-breaking spaces (the entity or U+00A0) into plain spaces. This
    # must precede URL removal so an entity glued to a URL is not swallowed
    # with it, and precede punctuation removal so "&nbsp;" does not degrade
    # into the word "nbsp".
    cleaned = re.sub(r"&nbsp;|\xa0", " ", cleaned)
    # Drop URLs: everything from "http" up to the next whitespace character.
    cleaned = re.sub(r"http\S+", "", cleaned)
    # Remove punctuation. Python 3's str.translate takes a single mapping
    # argument, so the deletion set is expressed as a str.maketrans table.
    cleaned = cleaned.translate(_PUNCTUATION_TABLE)
    # Finally, we deal with whitespace: collapse runs of spaces last so the
    # gaps left behind by removed tags, URLs and punctuation are normalised.
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned.strip()