1.下载DataX
2.修改pom.xml
writer部分我保留了
<module>mysqlwriter</module>
<module>txtfilewriter</module>
<module>streamwriter</module>
<module>elasticsearchwriter</module>
3.maven打包后上传至服务器
4.全量同步,job.json如下
{
  "job": {
    "content": [
      {
        "reader": {
          "name": "mysqlreader",
          "parameter": {
            "column": [
              "id",
              "title"
            ],
            "where": "1=1",
            "connection": [
              {
                "jdbcUrl": [
                  "jdbc:mysql://127.0.0.1:3306/wifi?autoReconnect=true&useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=false"
                ],
                "table": ["*****"]
              }
            ],
            "password": "****",
            "username": "****"
          }
        },
        "writer": {
          "name": "elasticsearchwriter",
          "parameter": {
            "endpoint": "http://127.0.0.1:9200",
            "accessId": "elastic",
            "accessKey": "elastic",
            "index": "****",
            "type": "_doc",
            "cleanup": false,
            "discovery": false,
            "column": [
              {
                "name": "id",
                "type": "long"
              },
              {
                "name": "title",
                "type": "text"
              }
            ]
          }
        }
      }
    ],
    "setting": {
      "speed": {
        "channel": 1
      }
    }
  }
}
5.着重说一下增量同步
大体思路是每次同步的时候把最大ID写入文件,然后根据ID获取新的数据写入es
用脚本定时任务来实现,sh如下
#!/bin/bash
# datax incremental extraction: read the max id recorded by the previous run,
# sync only rows with id greater than it, then record the new max id.
# create time:2020-07-14 15:24:00
### every exit != 0 fails the script
# Abort on any command failure.
set -e

JOB_DIR=/data0/datax/job

# Locate the file written by DataX's txtfilewriter (it appends a random suffix
# to fileName, hence the glob) and read the stored max id.
RESULT_FILE=$(ls "$JOB_DIR"/mysql_max_id_result_*)
MAX_ID=$(cat "$RESULT_FILE")
echo "$RESULT_FILE $MAX_ID"

# Guard against an empty result (first run, or the max(id) query returned NULL):
# without this, the generated WHERE clause would be the invalid SQL "id > ".
if [ -z "$MAX_ID" ]; then
    MAX_ID=0
fi

# Incremental filter: only rows newer than the last synced id.
WHERE="id > $MAX_ID"

# Replace the placeholder condition 1=1 in the full-sync job config with the
# incremental filter, producing a temporary job file.
sed "s/1=1/$WHERE/g" "$JOB_DIR/mysqltoes.json" > "$JOB_DIR/mysqltoes_tmp.json"
echo "增量更新"
python /data0/datax/bin/datax.py "$JOB_DIR/mysqltoes_tmp.json"

# Remove the temporary job file. NOTE: must be the absolute path — the original
# "rm ./mysqltoes_tmp.json" fails under set -e (killing the script before the
# max-id update below) whenever the cron job's cwd is not $JOB_DIR.
rm "$JOB_DIR/mysqltoes_tmp.json"

# Record this run's max id for the next invocation.
python /data0/datax/bin/datax.py "$JOB_DIR/get_max_id.json"
echo "执行结束"
======================
get_max_id.json如下
{
  "job": {
    "content": [
      {
        "reader": {
          "name": "mysqlreader",
          "parameter": {
            "connection": [
              {
                "jdbcUrl": [
                  "jdbc:mysql://127.0.0.1:3306/wifi?autoReconnect=true&useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull&useSSL=false"
                ],
                "querySql": [
                  "SELECT max(id) FROM ****"
                ]
              }
            ],
            "password": "****",
            "username": "****"
          }
        },
        "writer": {
          "name": "txtfilewriter",
          "parameter": {
            "fileName": "mysql_max_id_result",
            "fileFormat": "text",
            "path": "/data0/datax/job",
            "writeMode": "truncate"
          }
        }
      }
    ],
    "setting": {
      "speed": {
        "channel": 1
      }
    }
  }
}