SpringBoot—极简版 Java 敏感词检测工具

一、使用步骤

1）引入 Maven 依赖：引入最新的版本即可，见附录开源地址。

<dependency>
    <groupId>com.github.houbb</groupId>
    <artifactId>sensitive-word</artifactId>
    <version>0.18.0</version>
</dependency>

2）核心方法使用实例：包含了主要的一些功能和方法，如下所示：

常规用法查找替换；
指定替换字符串；
检测忽略大小写，特殊字符，重复字符，简繁体，中英文等；
自定义替换检测策略示例；

package com.example.demo;

import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordReplace;
import com.github.houbb.sensitive.word.api.IWordResult;
import com.github.houbb.sensitive.word.bs.SensitiveWordBs;
import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
import com.github.houbb.sensitive.word.support.result.WordResultHandlers;
import com.github.houbb.sensitive.word.utils.InnerWordCharUtils;

import java.util.List;

/**
 * Console walkthrough of the sensitive-word library's lookup and replacement APIs.
 *
 * <p>Each {@code test*} method prints the results of one feature group to standard output.
 * The "Expected" notes record the values from the article's assertions and sample output.
 */
public class SensitiveWordTestDemo {

    /**
     * Runs every demo, in the order in which the article lists its sample output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        testNormal();
        testDefineReplace();
        testSensitiveWordResultHandler();
        testOtherFeatures();
        testMoreFeatures();
    }

    /**
     * Basic usage: contains / findFirst / findAll / replace through the static helper.
     */
    public static void testNormal() {
        final String text = "五星红旗迎风飘扬，毛主席的画像屹立在天安门前。";
        System.out.println("是否包含铭感词：" + SensitiveWordHelper.contains(text));
        System.out.println("查找第一个铭感词：" + SensitiveWordHelper.findFirst(text));
        System.out.println("查找所有铭感词：" + SensitiveWordHelper.findAll(text));

        System.out.println("替换所有铭感词：" + SensitiveWordHelper.replace(text));
        // Same replacement, but with a caller-chosen mask character.
        System.out.println("替换所有铭感词(指定替换符号)：" + SensitiveWordHelper.replace(text, '⭐'));
    }

    /**
     * Shows how an IWordResultHandler shapes the values returned by {@code findAll},
     * which lets callers customise the result type.
     */
    public static void testSensitiveWordResultHandler() {
        final String text = "五星红旗迎风飘扬，毛主席的画像屹立在天安门前。";

        List<String> wordList = SensitiveWordHelper.findAll(text);
        // Expected: [五星红旗, 毛主席, 天安门]
        System.out.println("1.查找到所有铭感词：" + wordList);

        // WordResultHandlers.word() yields the matched strings, same as the plain findAll above.
        List<String> wordList2 = SensitiveWordHelper.findAll(text, WordResultHandlers.word());
        // Expected: [五星红旗, 毛主席, 天安门]
        System.out.println("2.默认内置处理(同直接查找到所有敏感词)：" + wordList2);

        // WordResultHandlers.raw() yields the match objects, which carry start/end indexes.
        List<IWordResult> wordList3 = SensitiveWordHelper.findAll(text, WordResultHandlers.raw());
        // Expected (per the article's sample run):
        // [WordResult{startIndex=0, endIndex=4, type='WORD'},
        //  WordResult{startIndex=9, endIndex=12, type='WORD'},
        //  WordResult{startIndex=18, endIndex=21, type='WORD'}]
        System.out.println("3.查找敏感词单词本身的起始位置到终止位置：" + wordList3);
    }

    /**
     * Demo: matching that ignores letter case, full-/half-width forms, digit styles,
     * simplified/traditional Chinese, stylised English letters and repeated characters.
     */
    public static void testOtherFeatures() {
        System.out.println("\n其他属性\n");
        String text = "fuCK the bad words.";
        String word = SensitiveWordHelper.findFirst(text);
        // Expected: fuCK
        System.out.println("忽略大小写：" + word);
        System.out.println("替换大小写字符：" + SensitiveWordHelper.replace(text));

        text = "ｆｕｃｋ the bad words.";
        word = SensitiveWordHelper.findFirst(text);
        // Expected: ｆｕｃｋ
        System.out.println("忽略半圆角：" + word);
        System.out.println("替换半圆角字符：" + SensitiveWordHelper.replace(text));

        text = "这个是我的微信：9⓿二肆⁹₈③⑸⒋➃㈤㊄";
        // One digit-checking instance serves both the lookup and the replacement, so an
        // identical second instance does not have to be built and initialised.
        SensitiveWordBs numCheckBs = SensitiveWordBs.newInstance().enableNumCheck(true).init();
        List<String> wordList = numCheckBs.findAll(text);
        // Expected: [9⓿二肆⁹₈③⑸⒋➃㈤㊄]
        System.out.println("忽略数字的写法：" + wordList);
        System.out.println("替换数字字符：" + numCheckBs.replace(text));

        text = "我爱我的祖国和五星紅旗。";
        List<String> wordList1 = SensitiveWordHelper.findAll(text);
        // Expected: [五星紅旗]
        System.out.println("检测敏感词简繁体格式是否存在：" + wordList1);

        text = "Ⓕⓤc⒦ the bad words";
        List<String> wordList2 = SensitiveWordHelper.findAll(text);
        // Expected: [Ⓕⓤc⒦]
        System.out.println("检测敏感词是否存在英文的书写格式：" + wordList2);

        text = "ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦ the bad words";
        // Repeated-character matching is switched on explicitly through ignoreRepeat(true).
        List<String> wordList3 = SensitiveWordBs.newInstance()
                .ignoreRepeat(true)
                .init()
                .findAll(text);
        // Expected: [ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]
        System.out.println("检测重复词每个字符是否重复：" + wordList3);
    }

    /**
     * Demo: optional detectors for e-mail addresses, digit runs, URLs and IPv4 addresses.
     * Each one is switched on explicitly through its {@code enable*} option.
     */
    public static void testMoreFeatures() {
        // 1. E-mail detection (personal information; not enabled by default).
        String text = "楼主好人，邮箱 sensitiveword@xx.com";
        List<String> wordList = SensitiveWordBs.newInstance().enableEmailCheck(true).init().findAll(text);
        // Expected: [sensitiveword@xx.com]
        System.out.println("是否存在邮箱：" + wordList);

        // 2. Consecutive-digit detection (typically filters phone/QQ numbers in ads;
        //    not enabled by default).
        text = "你懂得：12345678";
        // The default length checked is 8 digits.
        List<String> wordList1 = SensitiveWordBs.newInstance()
                .enableNumCheck(true)
                .init().findAll(text);
        // Expected: [12345678]
        System.out.println("是否存在连续数字字符串：" + wordList1);
        // A custom length avoids false positives: the 8-digit text no longer matches at 9.
        List<String> wordList2 = SensitiveWordBs.newInstance()
                .enableNumCheck(true)
                .numCheckLen(9)
                .init().findAll(text);
        // Expected: []
        System.out.println("是否存在连续数字字符串2：" + wordList2);

        // 3. URL detection (filters common URLs; not enabled by default; v0.18.0 made the
        //    URL check stricter to lower the false-positive rate).
        text = "点击链接 https://www.baidu.com 查看答案";
        SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance().enableUrlCheck(true).init();
        List<String> wordList3 = sensitiveWordBs.findAll(text);
        // Expected findAll: [https://www.baidu.com]
        // Expected replace: 点击链接 ********************* 查看答案
        System.out.println("是否存在网址信息：" + wordList3);
        System.out.println("是否存在网址信息2并替换：" + sensitiveWordBs.replace(text));

        // 4. IPv4 detection: stops users from bypassing the URL check with a raw IP;
        //    not enabled by default.
        text = "个人网站，如果网址打不开可以访问 127.0.0.1。";
        SensitiveWordBs sensitiveWordBs2 = SensitiveWordBs.newInstance().enableIpv4Check(true).init();
        List<String> wordList4 = sensitiveWordBs2.findAll(text);
        // Expected: [127.0.0.1]
        System.out.println("是否存在 IPv4：" + wordList4);
    }

    /**
     * Demo: replaces sensitive words through the custom {@code MySensitiveWordReplace} strategy.
     */
    public static void testDefineReplace() {
        System.out.println("自定义敏感词替换策略：（策略：指定敏感词替换）");
        final String text = "五星红旗迎风飘扬，毛主席的画像屹立在天安门前。";
        MySensitiveWordReplace replace = new MySensitiveWordReplace();
        String result = SensitiveWordHelper.replace(text, replace);
        System.out.println("自定义替换策略结果：" + result);
    }
}

/**
 * Custom replacement strategy: two specific words receive fixed substitutes, and every
 * other match is masked with one '*' per position between its start and end index.
 */
class MySensitiveWordReplace implements IWordReplace {

    /**
     * Appends the replacement for a single match to {@code stringBuilder}.
     *
     * @param stringBuilder buffer that receives the replacement text
     * @param chars         characters the match was found in
     * @param wordResult    the match, providing its start and end index
     * @param iWordContext  library context; not used by this strategy
     */
    @Override
    public void replace(StringBuilder stringBuilder, char[] chars, IWordResult wordResult, IWordContext iWordContext) {
        final String matched = InnerWordCharUtils.getString(chars, wordResult);

        // Word-specific substitutes; such a mapping could also be read from a database or similar.
        if ("五星红旗".equals(matched)) {
            stringBuilder.append("国家旗帜");
            return;
        }
        if ("毛主席".equals(matched)) {
            stringBuilder.append("教员");
            return;
        }

        // Fallback for every other word: mask it with '*'.
        int remaining = wordResult.endIndex() - wordResult.startIndex();
        while (remaining-- > 0) {
            stringBuilder.append('*');
        }
    }
}

输出结果展示：

是否包含铭感词：true
查找第一个铭感词：五星红旗
查找所有铭感词：[五星红旗, 毛主席, 天安门]
替换所有铭感词：****迎风飘扬，***的画像屹立在***前。
替换所有铭感词(指定替换符号)：⭐⭐⭐⭐迎风飘扬，⭐⭐⭐的画像屹立在⭐⭐⭐前。
自定义敏感词替换策略：（策略：指定敏感词替换）
自定义替换策略结果：国家旗帜迎风飘扬，教员的画像屹立在***前。
1.查找到所有铭感词：[五星红旗, 毛主席, 天安门]
2.默认内置处理(同直接查找到所有敏感词)：[五星红旗, 毛主席, 天安门]
3.查找敏感词单词本身的起始位置到终止位置：[WordResult{startIndex=0, endIndex=4, type='WORD'}, WordResult{startIndex=9, endIndex=12, type='WORD'}, WordResult{startIndex=18, endIndex=21, type='WORD'}]

其他属性

忽略大小写：fuCK
替换大小写字符：**** the bad words.
忽略半圆角：ｆｕｃｋ
替换半圆角字符：**** the bad words.
忽略数字的写法：[9⓿二肆⁹₈③⑸⒋➃㈤㊄]
替换数字字符：这个是我的微信：************
检测敏感词简繁体格式是否存在：[五星紅旗]
检测敏感词是否存在英文的书写格式：[Ⓕⓤc⒦]
检测重复词每个字符是否重复：[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]
是否存在邮箱：[sensitiveword@xx.com]
是否存在连续数字字符串：[12345678]
是否存在连续数字字符串2：[]
是否存在网址信息：[https://www.baidu.com]
是否存在网址信息2并替换：点击链接 ********************* 查看答案
是否存在 IPv4：[127.0.0.1]

二、核心方法：查找 / 替换

三、更多的检测策略（自定义）

1）邮箱-网址-IPV4-连续字符检测

/**
 * Demo: optional detectors for e-mail addresses, digit runs, URLs and IPv4 addresses.
 * Each one is switched on explicitly through its enable* option.
 */
public static void testMoreFeatures() {
    // 1. E-mail detection (personal information; not enabled by default).
    final String mailText = "楼主好人，邮箱 sensitiveword@xx.com";
    final SensitiveWordBs mailBs = SensitiveWordBs.newInstance().enableEmailCheck(true).init();
    final List<String> mails = mailBs.findAll(mailText);
    // Expected: [sensitiveword@xx.com]
    System.out.println("是否存在邮箱：" + mails.toString());

    // 2. Consecutive-digit detection (typically filters phone/QQ numbers in ads;
    //    not enabled by default).
    final String digitText = "你懂得：12345678";
    // The default length checked is 8 digits.
    final SensitiveWordBs defaultLenBs = SensitiveWordBs.newInstance().enableNumCheck(true).init();
    final List<String> digitsDefaultLen = defaultLenBs.findAll(digitText);
    // Expected: [12345678]
    System.out.println("是否存在连续数字字符串：" + digitsDefaultLen);
    // A custom length avoids false positives: the 8-digit text no longer matches at 9.
    final SensitiveWordBs nineDigitBs = SensitiveWordBs.newInstance()
        .enableNumCheck(true)
        .numCheckLen(9)
        .init();
    final List<String> digitsLenNine = nineDigitBs.findAll(digitText);
    // Expected: []
    System.out.println("是否存在连续数字字符串2：" + digitsLenNine.toString());

    // 3. URL detection (filters common URLs; not enabled by default; v0.18.0 made the
    //    URL check stricter to lower the false-positive rate).
    final String urlText = "点击链接 https://www.baidu.com 查看答案";
    final SensitiveWordBs urlBs = SensitiveWordBs.newInstance().enableUrlCheck(true).init();
    final List<String> urls = urlBs.findAll(urlText);
    // Expected findAll: [https://www.baidu.com]
    // Expected replace: 点击链接 ********************* 查看答案
    System.out.println("是否存在网址信息：" + urls.toString());
    System.out.println("是否存在网址信息2并替换：" + urlBs.replace(urlText));

    // 4. IPv4 detection: stops users from bypassing the URL check with a raw IP;
    //    not enabled by default.
    final String ipText = "个人网站，如果网址打不开可以访问 127.0.0.1。";
    final SensitiveWordBs ipv4Bs = SensitiveWordBs.newInstance().enableIpv4Check(true).init();
    final List<String> ips = ipv4Bs.findAll(ipText);
    // Expected: [127.0.0.1]
    System.out.println("是否存在 IPv4：" + ips.toString());
}

2）常规检测：大小写-特殊字符-重复字符-简繁体等

/**
 * Demo: matching that ignores letter case, full-/half-width forms, digit styles,
 * simplified/traditional Chinese, stylised English letters and repeated characters.
 */
public static void testOtherFeatures() {
    System.out.println("\n其他属性\n");
    String text = "fuCK the bad words.";
    String word = SensitiveWordHelper.findFirst(text);
    // Expected: fuCK
    System.out.println("忽略大小写：" + word);
    System.out.println("替换大小写字符：" + SensitiveWordHelper.replace(text));

    text = "ｆｕｃｋ the bad words.";
    word = SensitiveWordHelper.findFirst(text);
    // Expected: ｆｕｃｋ
    System.out.println("忽略半圆角：" + word);
    System.out.println("替换半圆角字符：" + SensitiveWordHelper.replace(text));

    text = "这个是我的微信：9⓿二肆⁹₈③⑸⒋➃㈤㊄";
    // One digit-checking instance serves both the lookup and the replacement, so an
    // identical second instance does not have to be built and initialised.
    SensitiveWordBs numCheckBs = SensitiveWordBs.newInstance().enableNumCheck(true).init();
    List<String> wordList = numCheckBs.findAll(text);
    // Expected: [9⓿二肆⁹₈③⑸⒋➃㈤㊄]
    System.out.println("忽略数字的写法：" + wordList);
    System.out.println("替换数字字符：" + numCheckBs.replace(text));

    text = "我爱我的祖国和五星紅旗。";
    List<String> wordList1 = SensitiveWordHelper.findAll(text);
    // Expected: [五星紅旗]
    System.out.println("检测敏感词简繁体格式是否存在：" + wordList1);

    text = "Ⓕⓤc⒦ the bad words";
    List<String> wordList2 = SensitiveWordHelper.findAll(text);
    // Expected: [Ⓕⓤc⒦]
    System.out.println("检测敏感词是否存在英文的书写格式：" + wordList2);

    text = "ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦ the bad words";
    // Repeated-character matching is switched on explicitly through ignoreRepeat(true).
    List<String> wordList3 = SensitiveWordBs.newInstance()
        .ignoreRepeat(true)
        .init()
        .findAll(text);
    // Expected: [ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]
    System.out.println("检测重复词每个字符是否重复：" + wordList3);
}

3）自定义检测替换策略

自定义检测替换

/**
 * Custom replacement strategy: two specific words receive fixed substitutes, and every
 * other match is masked with one '*' per position between its start and end index.
 */
class MySensitiveWordReplace implements IWordReplace {

    /**
     * Appends the replacement for a single match to {@code stringBuilder}.
     *
     * @param stringBuilder buffer that receives the replacement text
     * @param chars         characters the match was found in
     * @param wordResult    the match, providing its start and end index
     * @param iWordContext  library context; not used by this strategy
     */
    @Override
    public void replace(StringBuilder stringBuilder, char[] chars, IWordResult wordResult, IWordContext iWordContext) {
        final String matched = InnerWordCharUtils.getString(chars, wordResult);

        // Word-specific substitutes; such a mapping could also be read from a database or similar.
        if ("五星红旗".equals(matched)) {
            stringBuilder.append("国家旗帜");
            return;
        }
        if ("毛主席".equals(matched)) {
            stringBuilder.append("教员");
            return;
        }

        // Fallback for every other word: mask it with '*'.
        int remaining = wordResult.endIndex() - wordResult.startIndex();
        while (remaining-- > 0) {
            stringBuilder.append('*');
        }
    }
}

使用实例：

/**
 * Demo: replaces sensitive words through the custom MySensitiveWordReplace strategy
 * and prints the rewritten sentence.
 */
public static void testDefineReplace() {
    System.out.println("自定义敏感词替换策略：（策略：指定敏感词替换）");
    final String input = "五星红旗迎风飘扬，毛主席的画像屹立在天安门前。";
    // Hand the custom strategy straight to the helper instead of holding it in a local.
    final String replaced = SensitiveWordHelper.replace(input, new MySensitiveWordReplace());
    System.out.println("自定义替换策略结果：" + replaced);
}

开源地址

https://github.com/houbb/sensitive-word

来源：juejin.cn/post/7392197771052924979

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 203,937评论 6赞 478
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 85,503评论 2赞 381
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 150,712评论 0赞 337
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 54,668评论 1赞 276
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 63,677评论 5赞 366
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 48,601评论 1赞 281
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 37,975评论 3赞 396
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 36,637评论 0赞 258
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 40,881评论 1赞 298
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 35,621评论 2赞 321
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 37,710评论 1赞 329
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 33,387评论 4赞 319
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 38,971评论 3赞 307
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 29,947评论 0赞 19
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 31,189评论 1赞 260
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 44,805评论 2赞 349
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 42,449评论 2赞 342

SpringBoot—极简版 Java 敏感词检测工具

一、使用步骤

1）引入 Maven 依赖引入最新的版本即可，见附录开源地址。

2）核心方法使用实例包含了主要的一些功能和方法，如下所示：

二、核心方法：查找 / 替换

三、更多的检测策略（自定义）

1）邮箱-网址-IPV4-连续字符检测

2）常规检测：大小写-特殊字符-重复字符-简繁体等

3）自定义检测替换策略

推荐阅读更多精彩内容