学习UDF编写流程见:http://www.jianshu.com/p/ff0913045610
1.截取请求地址
代码:
package hiveUDF;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
 * Hive UDF that pulls the request address out of an HTTP request line.
 *
 * <p>Example: {@code "GET /course/view.php?id=27 HTTP/1.1"} yields
 * {@code "/course/view.php?id=27"}.
 */
public class TruncationRequestAdd extends UDF {

    /**
     * Splits the request line on single spaces and returns the second piece.
     *
     * @param add the raw request line; may be {@code null}
     * @return the second space-separated piece, or {@code null} when the input
     *         is {@code null} or the split produces fewer than three pieces
     */
    public Text evaluate(Text add) {
        // A null input short-circuits to a null array so both rejection
        // cases are handled by the single guard below.
        final String[] pieces = (add == null) ? null : add.toString().split(" ");

        if (pieces == null || pieces.length < 3) {
            return null;
        }

        // Index 1 sits between the method (index 0) and the protocol (index 2)
        // in a line such as "GET /path HTTP/1.1".
        return new Text(pieces[1]);
    }
}
- 截取主地址
代码:
package hiveUDF;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
 * Hive UDF that extracts the main address from an HTTP URL.
 *
 * <p>Example: {@code "http://www.ibeifeng.com/user.php?act=mycourse"} yields
 * {@code "http://www.ibeifeng.com"}.
 */
public class TruncationMainAdd extends UDF {

    /**
     * Matches {@code http://} followed by every character up to (not including)
     * the first {@code '/'}. Compiled once instead of on every row.
     */
    private static final Pattern MAIN_ADDRESS = Pattern.compile("http://[^/]+");

    /**
     * Returns the leading {@code http://...} part of the URL, cut before the
     * first {@code '/'} that follows the scheme.
     *
     * <p>A URL without any path (for example {@code "http://www.ibeifeng.com"})
     * is returned unchanged rather than failing.
     *
     * @param add the URL; may be {@code null}
     * @return the main address, or {@code null} when the input is {@code null},
     *         does not start with {@code http://}, or has nothing between the
     *         scheme and the first {@code '/'}
     */
    public Text evaluate(Text add) {
        if (add == null) {
            return null;
        }

        Matcher matcher = MAIN_ADDRESS.matcher(add.toString());

        // lookingAt() anchors the match at the start of the input, so this one
        // check rejects both non-http:// inputs and inputs such as "http:///x".
        if (!matcher.lookingAt()) {
            return null;
        }

        // The matched prefix is the result itself; no need to locate the path
        // and cut it off, which failed for path-less URLs and could pick the
        // wrong position when the path text repeated later in the input.
        return new Text(matcher.group());
    }
}
- 转换日期格式
把日期格式为: "31/Aug/2015:00:04:37 +0800"
转换为: "2015-08-31 00:04:37"
代码:
package hiveUDF;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
/**
 * Hive UDF that reformats an access-log timestamp.
 *
 * <p>Example: {@code "31/Aug/2015:00:04:37 +0800"} becomes
 * {@code "2015-08-31 00:04:37"}. The trailing zone offset is not part of the
 * input pattern and is ignored, so the wall-clock time is carried over as-is.
 *
 * <p>{@link SimpleDateFormat} is not thread-safe; each instance of this class
 * owns its own pair of formatters and must not be shared across threads.
 */
public class DateTransform extends UDF {

    private final SimpleDateFormat inputFormat = newFormat("dd/MMM/yyyy:HH:mm:ss");
    private final SimpleDateFormat outputFormat = newFormat("yyyy-MM-dd HH:mm:ss");

    /**
     * Builds a formatter pinned to {@link Locale#ENGLISH} and UTC.
     *
     * <p>The fixed locale keeps month names and the calendar system independent
     * of the JVM default. The fixed, DST-free zone guarantees that parsing and
     * re-formatting never shifts a wall-clock time that happens to fall into a
     * daylight-saving gap of the JVM default zone.
     */
    private static SimpleDateFormat newFormat(String pattern) {
        SimpleDateFormat format = new SimpleDateFormat(pattern, Locale.ENGLISH);
        format.setTimeZone(java.util.TimeZone.getTimeZone("UTC"));
        return format;
    }

    /**
     * Converts one timestamp.
     *
     * @param input the timestamp in {@code dd/MMM/yyyy:HH:mm:ss} form, optionally
     *              followed by extra text; may be {@code null}
     * @return the timestamp as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} when
     *         the input is {@code null} or cannot be parsed
     */
    public Text evaluate(Text input) {
        if (input == null) {
            return null;
        }

        try {
            Date parsed = inputFormat.parse(input.toString());
            return new Text(outputFormat.format(parsed));
        } catch (java.text.ParseException ignored) {
            // Malformed rows map to NULL; printing a stack trace per bad row
            // would flood the task logs.
            return null;
        }
    }
}