java爬取资讯(新闻)
本篇博客只是交流学习,如有不妥请联系删除
续接前两篇爬图片和音乐,本次爬了一些文字信息,如果后期有需要再把图片加上
https://blog.****.net/m0_37615458/article/details/103867889
https://blog.****.net/m0_37615458/article/details/103902165
刚才简单地把资讯爬了一下,主要是抓取列表页的标题和详情页的文字内容,存到桌面上的txt文件中
jar包支撑与前两篇一致,不再赘述。
抓取的代码如下
================================================
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import javax.swing.filechooser.FileSystemView;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import spiderkugou.HtmlManage;
import spiderkugou.HttpGetConnect;
/**
* @author dongwn 拟抓取页面的资讯数据
*/
/**
 * Crawls a news site's listing pages, follows each item to its detail page,
 * and appends the titles and body text to a dated txt file on the desktop.
 *
 * @author dongwn 拟抓取页面的资讯数据
 */
public class TestGetInformation {

    /** Listing-page URL template; "PAGE" is substituted with the page number. */
    static String url = "http://news.chemnet.com/list-11-11-PAGE.html";

    public static void main(String[] args) throws IOException, ParseException, InterruptedException {
        // Crawl pages 1..50 inclusive (the original "i < 50" stopped one page
        // short of the stated 50), pausing 1s between pages to be polite.
        for (int i = 1; i <= 50; i++) {
            String newUrl = url.replace("PAGE", String.valueOf(i));
            getConnection(newUrl);
            Thread.sleep(1000);
        }
    }

    /**
     * Fetches one listing page, extracts each item's title and detail-page
     * text, and appends the batch to the output file.
     *
     * @param url fully-substituted listing-page URL
     * @throws IOException    on HTTP-client shutdown failure
     * @throws ParseException declared for parity with callers; not thrown here
     */
    public static void getConnection(String url) throws IOException, ParseException {
        StringBuffer sb = new StringBuffer();
        // connect() is static; call it on the class rather than via an
        // instance (removes the need for @SuppressWarnings("static-access")).
        String content = HttpGetConnect.connect(url, "utf-8");
        HtmlManage html = new HtmlManage();
        Document doc = html.manage(content);// parse into a jsoup Document
        Elements elements = doc.select(".content-list>ul>li");
        // Header: user name and timestamp. "HH" is the 24-hour field — the
        // original "hh" produced 12-hour times with no AM/PM marker.
        sb.append(System.getProperty("user.name") + "----->>>"
                + new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date()));
        sb.append("\r\n");
        sb.append("\r\n");
        for (Element ele : elements) {
            sb.append("标题---->>>");
            sb.append(ele.select("a").text().trim());// item title
            sb.append("\r\n");
            String detailUrl = "http://news.chemnet.com" + ele.select("a").attr("href");
            String detailcontent = HttpGetConnect.connect(detailUrl, "utf-8");
            Document detailDoc = html.manage(detailcontent);
            // Guard against detail pages without a ".detail-text>div" node —
            // the original unconditional get(0) threw IndexOutOfBoundsException.
            Elements detailDivs = detailDoc.select(".detail-text>div");
            String detailContent = detailDivs.isEmpty() ? "" : detailDivs.get(0).text();
            sb.append("内容---->>>");
            sb.append(detailContent);
            sb.append("\r\n");
            sb.append("\r\n");
            sb.append("\r\n");
        }
        writeToTxt(sb);
    }

    /**
     * Appends the buffer to "yyyy-MM-dd今日资讯.txt" on the user's desktop.
     * Uses try-with-resources: the original leaked the writer and threw a
     * NullPointerException ({@code new PrintWriter(null)}) whenever opening
     * the FileWriter failed.
     */
    public static void writeToTxt(StringBuffer sb) {
        File desktopDir = FileSystemView.getFileSystemView().getHomeDirectory();
        File f = new File(desktopDir.getAbsolutePath() + "/"
                + new SimpleDateFormat("yyyy-MM-dd").format(new Date()) + "今日资讯.txt");
        try (PrintWriter pw = new PrintWriter(new FileWriter(f, true))) {
            pw.println(sb);// println flushes on close; explicit flush() not needed
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
2个工具类与前两篇一致,此处贴下
===============================================
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.BasicHttpClientConnectionManager;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
* httpclient 工具类
*/
/**
 * HttpClient helper: fetches a URL and returns its HTML body as a String.
 */
public class HttpGetConnect {

    private static Log log = LogFactory.getLog(HttpGetConnect.class);

    /**
     * Downloads the body of a page.
     *
     * @param url         page URL
     * @param charsetName response charset, e.g. UTF-8 or GB2312
     * @return the page body, or "" on a non-2xx status or fetch error
     * @throws IOException if closing the HTTP client fails
     */
    public static String connect(String url, String charsetName) throws IOException {
        BasicHttpClientConnectionManager connManager = new BasicHttpClientConnectionManager();
        CloseableHttpClient httpclient = HttpClients.custom().setConnectionManager(connManager).build();
        String content = "";
        try {
            HttpGet httpget = new HttpGet(url);
            RequestConfig requestConfig = RequestConfig.custom()
                    .setSocketTimeout(5000)
                    .setConnectTimeout(50000)
                    .setConnectionRequestTimeout(50000)
                    .build();
            httpget.setConfig(requestConfig);
            httpget.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            // Deliberately NOT sending "Accept-Encoding: gzip,deflate,sdch":
            // the entity is read below as plain text, and HttpClient 4.x does
            // not transparently decompress when that header is set manually —
            // a gzipped response would have been decoded as garbage.
            httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httpget.setHeader("Connection", "keep-alive");
            httpget.setHeader("Upgrade-Insecure-Requests", "1");
            httpget.setHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
            httpget.setHeader("cache-control", "max-age=0");
            // Referer/Cookie carried over from the earlier kugou crawler
            // posts; presumably unnecessary for other hosts but harmless.
            httpget.setHeader("Referer", "https://www.kugou.com/song/");
            httpget.setHeader("Cookie", "kg_mid=9393340fecff864a4d6c4e95099b2be1;");
            CloseableHttpResponse response = httpclient.execute(httpget);
            try {
                int status = response.getStatusLine().getStatusCode();
                if (status >= 200 && status < 300) {
                    HttpEntity entity = response.getEntity();
                    InputStream instream = entity.getContent();
                    BufferedReader br = new BufferedReader(new InputStreamReader(instream, charsetName));
                    try {
                        StringBuffer sbf = new StringBuffer();
                        String line;
                        while ((line = br.readLine()) != null) {
                            sbf.append(line).append('\n');
                        }
                        content = sbf.toString();
                    } finally {
                        br.close();// closed even if readLine throws
                    }
                }
            } finally {
                // The original never closed the response, leaking the lone
                // connection held by BasicHttpClientConnectionManager.
                response.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            httpclient.close();
        }
        log.info("content is " + content);
        return content;
    }
}
===========================================
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* html manage 工具类
*/
/**
 * jsoup helper: parses HTML and extracts element inner-HTML by tag, class,
 * or attribute value.
 */
public class HtmlManage {

    private static Log log = LogFactory.getLog(HtmlManage.class);

    /** Parses an HTML string into a jsoup Document. */
    public Document manage(String html) {
        return Jsoup.parse(html);
    }

    /**
     * Fetches a URL and parses the response directly.
     *
     * @throws IOException on connection failure
     */
    public Document manageDirect(String url) throws IOException {
        return Jsoup.connect(url).get();
    }

    /** Returns the inner HTML of every element with the given tag name. */
    public List<String> manageHtmlTag(Document doc, String tag) {
        return toHtmlList(doc.getElementsByTag(tag));
    }

    /** Returns the inner HTML of every element with the given CSS class. */
    public List<String> manageHtmlClass(Document doc, String clas) {
        return toHtmlList(doc.getElementsByClass(clas));
    }

    /** Returns the inner HTML of every element whose attribute {@code key} equals {@code value}. */
    public List<String> manageHtmlKey(Document doc, String key, String value) {
        return toHtmlList(doc.getElementsByAttributeValue(key, value));
    }

    // Shared extraction loop — the original duplicated this three times.
    private List<String> toHtmlList(Elements elements) {
        List<String> list = new ArrayList<String>(elements.size());
        for (int i = 0; i < elements.size(); i++) {
            list.add(elements.get(i).html());
        }
        return list;
    }
}
上图
至此结束