java新闻爬取

本来想爬今日头条,在网上找了很多方法,走了很多弯路,异步刷新没能解决,本人爬虫小白。后来发现json数据和本地cookie也有关,感觉前路艰难。果断换到网易新闻,网易新闻相对来说获取数据比较简单,通过谷歌F12分析包数据,发现网易异步刷新的包和访问路径有关,通过在线json解析数据发现可以解析,这让我欣喜不已。

(此处为原文配图:抓包分析截图)
json数据:
(此处为原文配图:json 数据截图)
废话不多说,直接上代码
// NetEase news channel identifiers; `width` (defined by the caller) selects the channel
String[] typeArray={"BBM54PGAwangning","BCR1UC1Qwangning","BD29LPUBwangning","BD29MJTVwangning","C275ML7Gwangning"};
String type = typeArray[width];

// Base URL of the news-list endpoint (returns JSONP pages of 10 items)
String url1 = "http://3g.163.com/touch/reconstruct/article/list/";
// Base URL of individual article pages (docid + ".html" is appended)
String url2 = "http://3g.163.com/news/article/";

/**
 * Fetches one page of the news-list endpoint and extracts every article's "docid".
 *
 * The endpoint returns JSONP of the form {@code artiList({...})}; the wrapper is
 * stripped before parsing, and the channel key {@code type} maps to a JSON array
 * of article descriptors, each carrying a "docid".
 *
 * @param url  list endpoint base URL (url1)
 * @param num  paging offset, in multiples of 10
 * @param type channel identifier, used both in the URL and as the JSON key
 * @return docids found on this page; empty list on fetch/parse failure (never null)
 */
private static List<String> getDocid(String url, int num, String type) {
    List<String> id = new ArrayList<>();
    String json = JSONUtils.loadJson(url + type + "/" + num + "-10.html");
    if (json == null) {
        // Network/IO failure: return an empty list instead of NPE-ing below.
        return id;
    }
    // Strip the JSONP wrapper: everything between "artiList(" and the last ")".
    String jsonStr = StringUtils.substringBeforeLast(json, ")");
    String jsonStrO = StringUtils.substringAfter(jsonStr, "artiList(");
    Map parse = (Map) JSONObject.parse(jsonStrO);
    if (parse == null) {
        return id; // malformed payload
    }
    JSONArray parseArray = (JSONArray) parse.get(type);
    if (parseArray == null) {
        return id; // channel key missing in the response
    }
    for (int j = 0; j < parseArray.size(); j++) {
        Map map = (Map) parseArray.get(j);
        id.add((String) map.get("docid"));
    }
    return id;
}

/**
 * Fetches each article page by docid, parses title/source/time/content with
 * Jsoup CSS selectors, and inserts one MongoDB document per article.
 *
 * Fixes over the original version:
 *  - loop started at i = 1 and silently skipped ids.get(0) (off-by-one);
 *  - "title" was stored from src.html() instead of title.html() (copy-paste bug);
 *  - the MongoDB connection is now opened once, not once per article, and a
 *    failed connection aborts instead of NPE-ing on insert;
 *  - removed the dead DBCursor loop that re-read the whole collection after
 *    every insert (accidental O(n^2) work with no effect).
 *
 * @param url2 article-page base URL; the docid and ".html" are appended
 * @param ids  docids to fetch; entries that fail to download are logged and skipped
 */
private static void getContent(String url2, List<String> ids) {
    System.out.println("存储开始!!");
    DBCollection dbCollection;
    try {
        dbCollection = MongoDBUtils.connMongoDB();
    } catch (Exception e) {
        e.printStackTrace();
        return; // no DB connection — nothing can be stored
    }
    int i = 0;
    for (; i < ids.size(); i++) {
        String url = url2 + ids.get(i) + ".html";
        try {
            Document document = Jsoup.connect(url).get();
            // Article title
            Elements title = document.select("[class=title]");
            // Source outlet and publication time live inside the info element
            Elements articleInfo = document.select("[class=info]");
            Elements src = articleInfo.select("[class=source js-source]");
            Elements time = articleInfo.select("[class=time js-time]");
            // Article body
            Elements contentEle = document.select("[class=page js-page on]");
            BasicDBObject obj = new BasicDBObject();
            obj.put("title", title.html());
            obj.put("srcFrom", src.html());
            obj.put("time", time.html());
            obj.put("content", contentEle.html());
            dbCollection.insert(obj);
        } catch (IOException e) {
            // Best-effort crawl: log the failed article and continue with the next.
            e.printStackTrace();
        }
    }
    // ~20% of list entries are ad slots, hence the 0.8 estimate of real articles.
    System.out.println("本次共计存储"+i*0.8+"条数据");
}

/**
 * Controls crawl depth: pages through the list endpoint in steps of 10
 * (offsets 0, 10, ... up to and including {@code num}) and gathers every
 * docid returned.
 *
 * @param url1 list endpoint base URL
 * @param num  maximum paging offset (inclusive), a multiple of 10
 * @param type channel identifier passed through to {@link #getDocid}
 * @return all docids collected across the requested pages
 */
private static List<String> getIds(String url1, int num, String type) {
    List<String> ids = new ArrayList<>();
    for (int offset = 0; offset <= num; offset += 10) {
        ids.addAll(getDocid(url1, offset, type));
    }
    return ids;
}

public static void main(String[] args) throws Exception {
    // How many list entries to request, in multiples of 10. NetEase reserves
    // roughly 2 ad slots per 10 items, so real articles are ~80% of this figure.
    int deep = 30;
    // Channel index: 0 front page, 1 society, 2 domestic, 3 world, 4 history.
    int width = 1;

    // NetEase channel identifiers, indexed by `width`.
    String[] typeArray = {"BBM54PGAwangning", "BCR1UC1Qwangning",
            "BD29LPUBwangning", "BD29MJTVwangning", "C275ML7Gwangning"};
    String type = typeArray[width];

    // List endpoint (returns JSONP) and article-page base URL.
    String url1 = "http://3g.163.com/touch/reconstruct/article/list/";
    String url2 = "http://3g.163.com/news/article/";

    // Phase 1: collect docids from the list pages.
    List<String> ids = getIds(url1, deep, type);
    // Phase 2: fetch each article and persist it to MongoDB.
    getContent(url2, ids);
}

为了方便存取比较大的数据量,使用了mongodb数据库进行存储
列表
(此处为原文配图:MongoDB 中存储的新闻列表截图)
内容
(此处为原文配图:MongoDB 中存储的新闻内容截图)