一种基于标签属性的网页去噪方法
http://wenku.baidu.com/link?url=5A1-yUkoNYDwKTEyMWXZBb5L3drtTD6Hjg-w9EGfpMVLPPsatMX71zHZbAmyExsNbBOUIZshPU31GqU6Zpiwd07aaNGDDEB_Purp4fZDnNS
论文 http://www.cqvip.com/read/read.aspx?id=1005358363#
西安交通大学的论文 http://www.taodocs.com/p-41624911.html
一种基于内容规则的网页去噪算法
http://wenku.baidu.com/link?url=pWisCo4IfVeOiq0q6LhpbbDUoysCItjChi5mKGSpM_NzwbD9EDc8HlFQ0I1Lpzt-3uKUTqscoHcyyaLfVRJYcGOe35YEQlIDqyg11gVjTKm
一个网友提供的接口
http://www.weixinxi.wang/open/extract.html
/**
 * Thin client for the third-party page-extraction service hosted at
 * {@code www.weixinxi.wang}.
 *
 * <p>The address of the page to process is sent in the service's {@code url}
 * query parameter, and the body of the response is handed back to the caller
 * as text (presumably JSON, going by the method name; the payload is not
 * inspected here).
 */
public class Parser
{
    /** Service endpoint; the target page address is appended as the {@code url} parameter value. */
    private static final String SERVICE_ENDPOINT = "http://www.weixinxi.wang/service/parser?url=";

    /** Page submitted when the caller supplies no address. */
    private static final String DEFAULT_TARGET = "http://www.weixinxi.wang/blog/aitrcle.html?id=68";

    /** Maximum time to wait for the TCP connection to be established. */
    private static final int CONNECT_TIMEOUT_MS = 3000;

    /** Maximum time a single read may block, so a stalled server cannot hang the caller forever. */
    private static final int READ_TIMEOUT_MS = 10000;

    /**
     * Builds the full service request address for a target page.
     *
     * @param url address of the page to process; {@code null} or empty selects
     *            the built-in default page
     * @return the service endpoint followed by the percent-encoded target address
     */
    private static String verifyurl(String url)
    {
        String target = (url == null || url.isEmpty()) ? DEFAULT_TARGET : url;
        try {
            // The target is itself a URL: without encoding, any '&', '#', space or
            // non-ASCII character in it would truncate or corrupt the query parameter.
            return SERVICE_ENDPOINT + java.net.URLEncoder.encode(target, "UTF-8");
        } catch (java.io.UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 is not available", e);
        }
    }

    /**
     * Requests the service's result for the given page with an HTTP GET.
     *
     * <p>This method never throws: any failure (malformed address, network
     * error, timeout, non-200 status) is reported on {@code System.err} and
     * signalled to the caller by a {@code null} return value.
     *
     * @param urls address of the page to process; {@code null} or empty selects
     *             the built-in default page
     * @return the response body decoded as UTF-8, or {@code null} if the
     *         request failed or the status code was not 200
     */
    public static String getjson(String urls) {
        HttpURLConnection httpConn = null;
        try {
            URL url = new URL(verifyurl(urls));
            httpConn = (HttpURLConnection) url.openConnection();
            httpConn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            httpConn.setReadTimeout(READ_TIMEOUT_MS);
            httpConn.setRequestMethod("GET");
            httpConn.setRequestProperty("User-agent", "Mozilla/4.0");

            int respCode = httpConn.getResponseCode();
            if (respCode != HttpURLConnection.HTTP_OK) {
                System.err.println("Parser.getjson: unexpected HTTP status " + respCode + " for " + urls);
                return null;
            }

            // try-with-resources closes both streams even when a read fails midway.
            try (InputStream inputStream = httpConn.getInputStream();
                 ByteArrayOutputStream out = new ByteArrayOutputStream()) {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = inputStream.read(buffer, 0, buffer.length)) != -1) {
                    out.write(buffer, 0, len);
                }
                // Decode explicitly as UTF-8 instead of relying on the platform default charset.
                return new String(out.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
            }
        } catch (Exception e) {
            // Best-effort boundary: callers rely on null rather than an exception,
            // but the cause is reported instead of being silently discarded.
            System.err.println("Parser.getjson: request failed for " + urls + ": " + e);
            return null;
        } finally {
            if (httpConn != null) {
                httpConn.disconnect();
            }
        }
    }

    /**
     * Demo entry point: sends one sample news article to the service and prints
     * whatever {@link #getjson(String)} returns.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        System.out.println(getjson("http://news.xinhuanet.com/legal/2015-07/30/c_1116096002.htm"));
    }
}