109-文本分析之文本预处理

参考:《文本数据挖掘——基于R语言》

library(pacman)
p_load(dplyr, stringr, purrr)

1、读取数据

可以用任意文本数据代替,只需包含两列:一列为文档名或编号,另一列为文本内容。

# Read the raw product listing without treating the first row as a header,
# assign the eleven column names explicitly, then drop exact duplicate rows.
storagebottles <- read.csv("dataset/ali/storagebottles0905.csv",
                           header = FALSE) %>%
  set_names(c("sku_name", "sku_price", "sku_sale_volume", "sku_score",
              "sku_ship", "sku_isNewin", "sku_isPromotion",
              "sku_isTopselling", "shop_name", "sku_link", "category4")) %>%
  distinct(.keep_all = TRUE)

# Keep only well-formed rows, attach the fixed category labels, derive the
# SKU id from the link, and keep one row per SKU: the one with the highest
# sale volume.
storagebottles <- storagebottles %>%
  filter(!is.na(sku_name)) %>%
  filter(str_detect(sku_price, "^US")) %>%
  filter(str_detect(sku_link, "aliexpress")) %>%
  filter(str_detect(sku_sale_volume, "sold")) %>%
  mutate(category = "home",
         category2 = "Home Storage",
         category3 = "Storage Bottles & Jars") %>%
  # str_extract() already returns character, so no further cast is needed
  mutate(sku_id = str_extract(sku_link, "\\d{16}"),
         sku_link = paste0("http:", sku_link)) %>%
  # sku_sale_volume is text containing "sold"; sorting it directly is
  # lexicographic (e.g. "9 sold" sorts after "10 sold"), so sort on the
  # leading number instead.
  # NOTE(review): assumes the count is written as plain digits with optional
  # thousands separators (e.g. "1,234 sold") - confirm against the raw data.
  mutate(sale_num = as.numeric(str_remove_all(
    str_extract(sku_sale_volume, "\\d[\\d,]*"), ","
  ))) %>%
  # Descending order places unparsable (NA) volumes last, so they are only
  # kept when a SKU has no parsable row at all.
  arrange(desc(sale_num)) %>%
  group_by(sku_id) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  select(-sale_num)

# Working table for the text steps below: SKU id plus product title
df <- storagebottles %>%
  select(sku_id, sku_name)

2、文本纠错

p_load(hunspell)

# Check whether the first product title is free of spelling errors.
# hunspell_check() tests individual words, so the title is split into words
# first; the result is TRUE only when every word passes.
all(hunspell_check(hunspell_parse(df$sku_name[1])[[1]]))
## [1] FALSE
# List the words flagged as misspelled in the title
bad <- hunspell(df$sku_name[1])
print(bad[[1]])
## [1] "pcs"
# Suggested corrections for each flagged word
hunspell_suggest(bad[[1]])
## [[1]]
##  [1] "cps"  "cs"   "pecs" "pics" "pis"  "pas"  "pct"  "pus"  "p cs" "PCs"

3、切分

p_load(tokenizers, tidytext)

# Join the first two product titles into one string separated by "。"
txt <- paste(df$sku_name[1:2], collapse = "。")
print(txt)
## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles。1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"
# Paragraph tokenization, with "。" set as the paragraph delimiter
txt %>%
  tokenize_paragraphs(paragraph_break = "。")
## [[1]]
## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles"
## [2] "1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"
# Sentence tokenization
txt %>%
  tokenize_sentences()
## [[1]]
## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles。"
## [2] "1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"
# Word tokenization; punctuation is removed and text is lower-cased as well
txt %>%
  tokenize_words()
## [[1]]
##  [1] "1000pcs"     "8"           "32mm"        "0.5ml"       "plastic"    
##  [6] "centrifuge"  "tube"        "test"        "tubing"      "vial"       
## [11] "clear"       "plastic"     "container"   "home"        "garden"     
## [16] "storage"     "bottles"     "1000pcs"     "6"           "22mm"       
## [21] "0.2ml"       "plastic"     "bottles"     "gardening"   "storage"    
## [26] "container"   "transparent" "plastic"     "vials"       "pcr"        
## [31] "centrifuge"  "tube"
# Alternatively keep punctuation and drop purely numeric tokens;
# simplify = TRUE returns a character vector rather than a list
tokenize_words(txt, strip_punct = FALSE, strip_numeric = TRUE, simplify = TRUE)
##  [1] "1000pcs"     "*"           "32mm"        "0.5ml"       "plastic"    
##  [6] "centrifuge"  "tube"        "test"        "tubing"      "vial"       
## [11] "clear"       "plastic"     "container"   "home"        "garden"     
## [16] "storage"     "bottles"     "。"          "1000pcs"     "*"          
## [21] "22mm"        "0.2ml"       "plastic"     "bottles"     "gardening"  
## [26] "storage"     "container"   "transparent" "plastic"     "vials"      
## [31] "pcr"         "centrifuge"  "tube"
# N-gram tokenization (bigrams here); simplify = TRUE returns a character
# vector rather than a list
tokenize_ngrams(txt, n = 2, simplify = TRUE)
##  [1] "1000pcs 8"             "8 32mm"                "32mm 0.5ml"           
##  [4] "0.5ml plastic"         "plastic centrifuge"    "centrifuge tube"      
##  [7] "tube test"             "test tubing"           "tubing vial"          
## [10] "vial clear"            "clear plastic"         "plastic container"    
## [13] "container home"        "home garden"           "garden storage"       
## [16] "storage bottles"       "bottles 1000pcs"       "1000pcs 6"            
## [19] "6 22mm"                "22mm 0.2ml"            "0.2ml plastic"        
## [22] "plastic bottles"       "bottles gardening"     "gardening storage"    
## [25] "storage container"     "container transparent" "transparent plastic"  
## [28] "plastic vials"         "vials pcr"             "pcr centrifuge"       
## [31] "centrifuge tube"
# 4. Character tokenization; simplify = TRUE returns a character vector
tokenize_characters(txt, simplify = TRUE)
##   [1] "1" "0" "0" "0" "p" "c" "s" "8" "3" "2" "m" "m" "0" "5" "m" "l" "p" "l"
##  [19] "a" "s" "t" "i" "c" "c" "e" "n" "t" "r" "i" "f" "u" "g" "e" "t" "u" "b"
##  [37] "e" "t" "e" "s" "t" "t" "u" "b" "i" "n" "g" "v" "i" "a" "l" "c" "l" "e"
##  [55] "a" "r" "p" "l" "a" "s" "t" "i" "c" "c" "o" "n" "t" "a" "i" "n" "e" "r"
##  [73] "h" "o" "m" "e" "g" "a" "r" "d" "e" "n" "s" "t" "o" "r" "a" "g" "e" "b"
##  [91] "o" "t" "t" "l" "e" "s" "1" "0" "0" "0" "p" "c" "s" "6" "2" "2" "m" "m"
## [109] "0" "2" "m" "l" "p" "l" "a" "s" "t" "i" "c" "b" "o" "t" "t" "l" "e" "s"
## [127] "g" "a" "r" "d" "e" "n" "i" "n" "g" "s" "t" "o" "r" "a" "g" "e" "c" "o"
## [145] "n" "t" "a" "i" "n" "e" "r" "t" "r" "a" "n" "s" "p" "a" "r" "e" "n" "t"
## [163] "p" "l" "a" "s" "t" "i" "c" "v" "i" "a" "l" "s" "p" "c" "r" "c" "e" "n"
## [181] "t" "r" "i" "f" "u" "g" "e" "t" "u" "b" "e"

5、扩展缩写

p_load(qdap)

# Expand contractions
replace_contraction("MR. Jones isn't going.")
## [1] "MR. Jones is not going."
# Expand abbreviations
replace_abbreviation("MR. Jones isn't going.")
## [1] "Mister Jones isn't going."
# Spell out numbers
replace_number(1)
## [1] "one"
# Spell out ordinals
replace_ordinal("3rd")
## [1] "third"
# Replace symbols with words
replace_symbol("&")
## [1] "and"

6、词干提取

# Stem every word of the first product title; simplify = TRUE returns a
# character vector rather than a list
stem <- tokenize_word_stems(df$sku_name[1], simplify = TRUE)
stem
##  [1] "1000pcs"   "8"         "32mm"      "0.5ml"     "plastic"   "centrifug"
##  [7] "tube"      "test"      "tube"      "vial"      "clear"     "plastic"  
## [13] "contain"   "home"      "garden"    "storag"    "bottl"

7、词形还原

模型下载地址:
英文:https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe
中文:https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/chinese-gsd-ud-2.5-191206.udpipe

p_load(udpipe)

# Download the English lemmatization model from GitHub into model/ (the
# Chinese model can be fetched the same way). The download may fail on an
# unreliable network, so a model file that is already present is reused
# instead of being downloaded again on every run.
udmodel <- udpipe_download_model(language = "english",
                                 model_dir = "model/",
                                 overwrite = FALSE)
# Stop with a clear message instead of failing later while loading a file
# that was never written.
if (isTRUE(udmodel$download_failed)) {
  stop("UDPipe English model download failed: ", udmodel$download_message,
       call. = FALSE)
}

# Load the model
en_model <- udpipe_load_model(udmodel$file_model)
# Lemmatization
# NOTE(review): `stem` holds tokens that were already stemmed, so the lemma
# column simply repeats the stem (e.g. "centrifug", "bottl" in the output
# below). Pass the unstemmed words instead to see real lemmas.
udpipe_annotate(en_model, stem) %>%
  as_tibble() %>%
  # keep the token and its lemma
  select(token, lemma)
## token  lemma
## <chr>  <chr>
## 1000pcs  1000pcs         
## 8        8           
## 32mm 32mm            
## 0.5ml    0.5ml           
## plastic  plastic         
## centrifug    centrifug           
## tube tube            
## test test            
## tube tube            
## vial vial
## clear    clear           
## plastic  plastic         
## contain  contain         
## home home            
## garden   garden          
## storag   storag          
## bottl    bottl
# Download the Chinese model into model/, reusing the file when it has
# already been downloaded.
udmodel <- udpipe_download_model(language = "chinese",
                                 model_dir = "model/",
                                 overwrite = FALSE)
# Stop with a clear message instead of failing later while loading a file
# that was never written.
if (isTRUE(udmodel$download_failed)) {
  stop("UDPipe Chinese model download failed: ", udmodel$download_message,
       call. = FALSE)
}

# Load the model
cn_model <- udpipe_load_model(udmodel$file_model)
# Chinese lemmatization; the input text is converted to UTF-8 first
udpipe_annotate(cn_model, iconv("别人笑我忒疯癫", to = "UTF-8")) %>%
  as_tibble() %>%
  # keep the token and its lemma
  select(token, lemma)
## token  lemma
## <chr>  <chr>
## 别    别           
## 人笑   人笑          
## 我忒   我忒          
## 疯    疯           
## 癫    癫

8、词性标注

# Part-of-speech tagging of the first product title: annotate with the
# English model and keep each token with its universal POS tag
select(
  as_tibble(udpipe_annotate(object = en_model, x = df$sku_name[1])),
  token, upos
)
## token  upos
## <chr>  <chr>
## 1000 NUM         
## pcs  NOUN            
## 8    NUM         
## *    PUNCT           
## 32   NUM         
## mm   NOUN            
## 0.5  NUM         
## ml   NOUN            
## Plastic  PROPN           
## Centrifuge   PROPN   
……(太多就不一一列出来了)

PROPN表示专有名词,AUX表示助动词,ADJ表示形容词,DET表示限定词,NOUN表示名词,PUNCT表示标点符号

ADJ: adjective
ADP: adposition
ADV: adverb
AUX: auxiliary
CCONJ: coordinating conjunction
DET: determiner
INTJ: interjection
NOUN: noun
NUM: numeral
PART: particle
PRON: pronoun
PROPN: proper noun
PUNCT: punctuation
SCONJ: subordinating conjunction
SYM: symbol
VERB: verb
X: other

9、批量文本预处理

p_load(tidytext)

# Batch preprocessing: split every product title into one stemmed token per
# row. The tokens are written to a new `stem` column, read from `sku_name`,
# with tokenize_word_stems doing the tokenizing and stemming.
df <- df %>%
  unnest_tokens(output = stem,
                input = sku_name,
                token = tokenize_word_stems)
df
## # A tibble: 21,765 × 2
##    sku_id           stem     
##    <chr>            <chr>    
##  1 2251801564728378 1000pcs  
##  2 2251801564728378 8        
##  3 2251801564728378 32mm     
##  4 2251801564728378 0.5ml    
##  5 2251801564728378 plastic  
##  6 2251801564728378 centrifug
##  7 2251801564728378 tube     
##  8 2251801564728378 test     
##  9 2251801564728378 tube     
## 10 2251801564728378 vial     
## # … with 21,755 more rows
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 194,088评论 5 459
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 81,715评论 2 371
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 141,361评论 0 319
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 52,099评论 1 263
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 60,987评论 4 355
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 46,063评论 1 272
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 36,486评论 3 381
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 35,175评论 0 253
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 39,440评论 1 290
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 34,518评论 2 309
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 36,305评论 1 326
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 32,190评论 3 312
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 37,550评论 3 298
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 28,880评论 0 17
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 30,152评论 1 250
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 41,451评论 2 341
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 40,637评论 2 335

推荐阅读更多精彩内容