java爬取资讯(新闻)
本篇博客只是交流学习,如有不妥请联系删除
续接前两篇爬图片和音乐,本次爬了一些文字信息,如果后期有需要再把图片加上
https://blog.****.net/m0_37615458/article/details/103867889
https://blog.****.net/m0_37615458/article/details/103902165
刚才简单地把资讯爬了一下,主要是抓取列表页的标题和详情页的文字内容,存到桌面上的txt文件中
jar包支撑与前两篇一致,不再赘述。
抓取的代码如下
================================================
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.swing.filechooser.FileSystemView;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import spiderkugou.HtmlManage;
import spiderkugou.HttpGetConnect;
/**
* @author dongwn 拟抓取页面的资讯数据
*/
/**
 * Crawls a news site's listing pages, follows each item to its detail page,
 * and appends the titles and body text to a dated txt file on the desktop.
 *
 * @author dongwn 拟抓取页面的资讯数据
 */
public class TestGetInformation {

    /** Listing-page URL template; "PAGE" is substituted with the page number. */
    static String url = "http://news.chemnet.com/list-11-11-PAGE.html";

    public static void main(String[] args) throws IOException, ParseException, InterruptedException {
        // Crawl pages 1..50 inclusive (the original "i < 50" stopped one page
        // short of the stated 50), pausing 1s between pages to be polite.
        for (int i = 1; i <= 50; i++) {
            String newUrl = url.replace("PAGE", String.valueOf(i));
            getConnection(newUrl);
            Thread.sleep(1000);
        }
    }

    /**
     * Fetches one listing page, extracts each item's title and detail-page
     * text, and appends the batch to the output file.
     *
     * @param url fully-substituted listing-page URL
     * @throws IOException    on HTTP-client shutdown failure
     * @throws ParseException declared for parity with callers; not thrown here
     */
    public static void getConnection(String url) throws IOException, ParseException {
        StringBuffer sb = new StringBuffer();
        // connect() is static; call it on the class rather than via an
        // instance (removes the need for @SuppressWarnings("static-access")).
        String content = HttpGetConnect.connect(url, "utf-8");
        HtmlManage html = new HtmlManage();
        Document doc = html.manage(content);// parse into a jsoup Document
        Elements elements = doc.select(".content-list>ul>li");
        // Header: user name and timestamp. "HH" is the 24-hour field — the
        // original "hh" produced 12-hour times with no AM/PM marker.
        sb.append(System.getProperty("user.name") + "----->>>"
                + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()));
        sb.append("\r\n");
        sb.append("\r\n");
        for (Element ele : elements) {
            sb.append("标题---->>>");
            sb.append(ele.select("a").text().trim());// item title
            sb.append("\r\n");
            String detailUrl = "http://news.chemnet.com" + ele.select("a").attr("href");
            String detailcontent = HttpGetConnect.connect(detailUrl, "utf-8");
            Document detailDoc = html.manage(detailcontent);
            // Guard against detail pages without a ".detail-text>div" node —
            // the original unconditional get(0) threw IndexOutOfBoundsException.
            Elements detailDivs = detailDoc.select(".detail-text>div");
            String detailContent = detailDivs.isEmpty() ? "" : detailDivs.get(0).text();
            sb.append("内容---->>>");
            sb.append(detailContent);
            sb.append("\r\n");
            sb.append("\r\n");
            sb.append("\r\n");
        }
        writeToTxt(sb);
    }

    /**
     * Appends the buffer to "yyyy-MM-dd今日资讯.txt" on the user's desktop.
     * Uses try-with-resources: the original leaked the writer and threw a
     * NullPointerException ({@code new PrintWriter(null)}) whenever opening
     * the FileWriter failed.
     */
    public static void writeToTxt(StringBuffer sb) {
        File desktopDir = FileSystemView.getFileSystemView().getHomeDirectory();
        File f = new File(desktopDir.getAbsolutePath() + "/"
                + new SimpleDateFormat("yyyy-MM-dd").format(new Date()) + "今日资讯.txt");
        try (PrintWriter pw = new PrintWriter(new FileWriter(f, true))) {
            pw.println(sb);// println flushes on close; explicit flush() not needed
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
2个工具类与前两篇一致,此处贴下
===============================================
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.BasicHttpClientConnectionManager;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
* httpclient 工具类
*/
/**
 * HttpClient helper: fetches a URL and returns its HTML body as a String.
 */
public class HttpGetConnect {

    private static Log log = LogFactory.getLog(HttpGetConnect.class);

    /**
     * Downloads the body of a page.
     *
     * @param url         page URL
     * @param charsetName response charset, e.g. UTF-8 or GB2312
     * @return the page body, or "" on a non-2xx status or fetch error
     * @throws IOException if closing the HTTP client fails
     */
    public static String connect(String url, String charsetName) throws IOException {
        BasicHttpClientConnectionManager connManager = new BasicHttpClientConnectionManager();
        CloseableHttpClient httpclient = HttpClients.custom().setConnectionManager(connManager).build();
        String content = "";
        try {
            HttpGet httpget = new HttpGet(url);
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(5000)
                    .setConnectTimeout(50000)
                    .setConnectionRequestTimeout(50000)
                    .build();
            httpget.setConfig(requestConfig);
            httpget.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            // Deliberately NOT sending "Accept-Encoding: gzip,deflate,sdch":
            // the entity is read below as plain text, and HttpClient 4.x does
            // not transparently decompress when that header is set manually —
            // a gzipped response would have been decoded as garbage.
            httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httpget.setHeader("Connection", "keep-alive");
            httpget.setHeader("Upgrade-Insecure-Requests", "1");
            httpget.setHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
            httpget.setHeader("cache-control", "max-age=0");
            // Referer/Cookie carried over from the earlier kugou crawler
            // posts; presumably unnecessary for other hosts but harmless.
            httpget.setHeader("Referer", "https://www.kugou.com/song/");
            httpget.setHeader("Cookie", "kg_mid=9393340fecff864a4d6c4e95099b2be1;");
            CloseableHttpResponse response = httpclient.execute(httpget);
            try {
                int status = response.getStatusLine().getStatusCode();
                if (status >= 200 && status < 300) {
                    HttpEntity entity = response.getEntity();
                    InputStream instream = entity.getContent();
                    BufferedReader br = new BufferedReader(new InputStreamReader(instream, charsetName));
                    try {
                        StringBuffer sbf = new StringBuffer();
                        String line;
                        while ((line = br.readLine()) != null) {
                            sbf.append(line).append('\n');
                        }
                        content = sbf.toString();
                    } finally {
                        br.close();// closed even if readLine throws
                    }
                }
            } finally {
                // The original never closed the response, leaking the lone
                // connection held by BasicHttpClientConnectionManager.
                response.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            httpclient.close();
        }
        log.info("content is " + content);
        return content;
    }
}
===========================================
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* html manage 工具类
*/
/**
 * jsoup helper: parses HTML and extracts element inner-HTML by tag, class,
 * or attribute value.
 */
public class HtmlManage {

    private static Log log = LogFactory.getLog(HtmlManage.class);

    /** Parses an HTML string into a jsoup Document. */
    public Document manage(String html) {
        return Jsoup.parse(html);
    }

    /**
     * Fetches a URL and parses the response directly.
     *
     * @throws IOException on connection failure
     */
    public Document manageDirect(String url) throws IOException {
        return Jsoup.connect(url).get();
    }

    /** Returns the inner HTML of every element with the given tag name. */
    public List<String> manageHtmlTag(Document doc, String tag) {
        return toHtmlList(doc.getElementsByTag(tag));
    }

    /** Returns the inner HTML of every element with the given CSS class. */
    public List<String> manageHtmlClass(Document doc, String clas) {
        return toHtmlList(doc.getElementsByClass(clas));
    }

    /** Returns the inner HTML of every element whose attribute {@code key} equals {@code value}. */
    public List<String> manageHtmlKey(Document doc, String key, String value) {
        return toHtmlList(doc.getElementsByAttributeValue(key, value));
    }

    // Shared extraction loop — the original duplicated this three times.
    private List<String> toHtmlList(Elements elements) {
        List<String> list = new ArrayList<String>(elements.size());
        for (int i = 0; i < elements.size(); i++) {
            list.add(elements.get(i).html());
        }
        return list;
    }
}
上图
至此结束