Implementing a Simple Web Crawler with HttpClient and Jsoup
In the course of learning, sooner or later you may need to scrape some data from the web. If you have never learned Python crawling, that can leave you at a loss, so here we will implement a simple web crawler as a Java program.
Required jar packages
Judging from the imports used below, the project needs the HttpClient 4.x jars plus jsoup and JUnit on the classpath.
Using HttpClient
package tqb.test.httpclient;

import java.io.IOException;

import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.junit.Test;

/**
 * Basic HttpClient usage
 * @author tqb
 */
public class Demo {

    @Test
    public void fun() throws ClientProtocolException, IOException {
        /*
         * Create the HttpClient object
         */
        HttpClient http = new DefaultHttpClient();
        /*
         * The URL to fetch; here it is my blog address
         */
        String url = "https://blog.****.net/weixin_42061805";
        /*
         * A proxy server could be configured here. Some sites have anti-crawler measures,
         * and going through a proxy keeps the site from blocking your own IP address.
         * We will not set a proxy in this demo; if you are interested, look for a proxy server online.
         */
        // String ip = "";
        // int port = 10000;
        /*
         * Get the HttpClient parameter object and configure it
         */
        http.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 3000)  // connection timeout
                        .setParameter(CoreConnectionPNames.SO_TIMEOUT, 3000);         // socket (response) timeout
        // .setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost(ip, port));      // proxy server
        /*
         * Build the request (GET)
         */
        HttpGet request = new HttpGet(url);
        /*
         * Execute the request and obtain the response
         */
        HttpResponse response = http.execute(request);
        /*
         * Read the response body and print it
         */
        String text = EntityUtils.toString(response.getEntity(), "utf-8");
        System.out.println(text);  // this prints the HTML source of the page
    }
}
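Note that DefaultHttpClient is marked deprecated from HttpClient 4.3 onwards. If you happen to have the newer jars, here is a minimal sketch of the same request using RequestConfig and CloseableHttpClient (the class name Demo43 is just for illustration):

package tqb.test.httpclient;

import java.io.IOException;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class Demo43 {

    public static void main(String[] args) throws IOException {
        // connection and socket timeouts go through RequestConfig instead of HttpParams
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(3000)
                .setSocketTimeout(3000)
                .build();
        CloseableHttpClient http = HttpClients.custom()
                .setDefaultRequestConfig(config)
                .build();
        HttpGet request = new HttpGet("https://blog.****.net/weixin_42061805");
        try (CloseableHttpResponse response = http.execute(request)) {
            // print the HTML source of the page
            System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
        } finally {
            http.close();
        }
    }
}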
Using Jsoup
package tqb.test.jsoup;

import java.io.IOException;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

public class Demo {

    @Test
    public void fun() throws IOException {
        String url = "https://blog.****.net/weixin_42061805";
        /*
         * Get a connection object
         */
        Connection conn = Jsoup.connect(url);
        /*
         * Fetch the document
         */
        Document document = conn.get();
        /*
         * Select elements with a CSS query expression
         */
        Elements elements = document.select("h4 a");
        /*
         * Iterate over the selected elements
         */
        for (Element element : elements) {
            /*
             * Print the element's text and the value of its href attribute
             */
            System.out.println(element.text() + ":" + element.attr("href"));
        }
    }
}
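Some sites refuse requests that do not look like they come from a browser. Jsoup's Connection lets you set a User-Agent and a timeout before calling get(); below is a small sketch of that (the User-Agent string and the class name Demo3 are just example values):

package tqb.test.jsoup;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class Demo3 {

    public static void main(String[] args) throws IOException {
        String url = "https://blog.****.net/weixin_42061805";
        // fetch with an explicit User-Agent and a 5-second timeout
        Document document = Jsoup.connect(url)
                .userAgent("Mozilla/5.0")
                .timeout(5000)
                .get();
        // print each matched element's text and link, as in the demo above
        for (Element element : document.select("h4 a")) {
            System.out.println(element.text() + ":" + element.attr("href"));
        }
    }
}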
Running results
Each printed line is the text of a matched h4 a element (a post title) followed by the value of its href attribute (the post's link).
Using HttpClient and Jsoup together
package tqb.test.jsoup;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.ParseException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

public class Demo2 {

    @Test
    public void fun() throws ParseException, IOException {
        String url = "https://blog.****.net/weixin_42061805";
        /*
         * Fetch the page source with HttpClient
         */
        HttpClient http = new DefaultHttpClient();
        HttpGet request = new HttpGet(url);
        HttpResponse response = http.execute(request);
        String text = EntityUtils.toString(response.getEntity(), "utf-8");
        /*
         * Parse the content with Jsoup
         */
        Document document = Jsoup.parse(text);
        Elements elements = document.select("h4 a");
        for (Element element : elements) {
            System.out.println(element.text() + ":" + element.attr("href"));
        }
    }
}
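If you want to reuse this combination on other pages, the fetch-and-parse steps can be wrapped in a small helper method. The sketch below does the same thing as Demo2, but takes the url and the CSS query as parameters (the class and method names are just illustrative):

package tqb.test.jsoup;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Demo4 {

    /**
     * Fetch the page with HttpClient, then select elements with the given CSS query.
     */
    public static Elements crawl(String url, String cssQuery) throws IOException {
        HttpClient http = new DefaultHttpClient();
        HttpGet request = new HttpGet(url);
        HttpResponse response = http.execute(request);
        String html = EntityUtils.toString(response.getEntity(), "utf-8");
        return Jsoup.parse(html).select(cssQuery);
    }

    public static void main(String[] args) throws IOException {
        // same output as Demo2, with the url and query passed in as arguments
        for (Element element : crawl("https://blog.****.net/weixin_42061805", "h4 a")) {
            System.out.println(element.text() + ":" + element.attr("href"));
        }
    }
}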