Java新闻爬取
本来想爬今日头条,在网上找了很多方法,走了很多弯路,异步刷新没能解决,本人爬虫小白。后来发现json数据和本地cookie也有关,感觉前路艰难。果断换到网易新闻,网易新闻相对来说获取数据比较简单,通过谷歌F12分析包数据,发现网易异步刷新的包和访问路径有关,通过在线json解析数据发现可以解析,这让我欣喜不已。
json数据:
废话不多说,直接上代码
//网易新闻类型 String[] typeArray={"BBM54PGAwangning","BCR1UC1Qwangning","BD29LPUBwangning","BD29MJTVwangning","C275ML7Gwangning"}; String type = typeArray[width]; //网易新闻列表url String url1 = "http://3g.163.com/touch/reconstruct/article/list/"; //网易新闻内容url String url2 = "http://3g.163.com/news/article/";
//根据新闻列表url,获取新闻docid,并把docid存储到list中 private static List<String> getDocid(String url,int num,String type) { String json = null; List<String> id=new ArrayList<>(); Map map=null; JSONArray parseArray=null; String jsonStrM=""; json = JSONUtils.loadJson(url+type+"/"+num+"-10.html"); String jsonStr = StringUtils.substringBeforeLast(json, ")"); String jsonStrO = StringUtils.substringAfter(jsonStr,"artiList("); Map parse = (Map) JSONObject.parse(jsonStrO); parseArray = (JSONArray) parse.get(type); for(int j=0;j<parseArray.size();j++){ map = (Map)parseArray.get(j); id.add((String) map.get("docid")); } return id; }
//根据内容url2获取新闻信息并进行存储 private static void getContent(String url2, List<String> ids) { System.out.println("存储开始!!"); String url = null; Connection connection = Jsoup.connect(url2); int i = 1; for (;i<ids.size();i++){ url = url2+ids.get(i)+".html"; connection = Jsoup.connect(url); try { Document document = connection.get(); //获取新闻标题 Elements title = document.select("[class=title]"); //获取新闻来源和文章发布时间 Elements articleInfo = document.select("[class=info]"); Elements src = articleInfo.select("[class=source js-source]"); Elements time = articleInfo.select("[class=time js-time]"); //获取新闻内容 Elements contentEle = document.select("[class=page js-page on]"); DBCollection dbCollection= null; try { dbCollection = MongoDBUtils.connMongoDB(); } catch (Exception e) { e.printStackTrace(); } BasicDBObject obj = new BasicDBObject(); obj.put("title", src.html()); obj.put("srcFrom", src.html()); obj.put("time", time.html()); obj.put("content", contentEle.html()); dbCollection.insert(obj); DBCursor dbCursor = dbCollection.find(); while(dbCursor.hasNext()){ Map map = (Map)dbCursor.next(); } } catch (IOException e) { e.printStackTrace(); } } System.out.println("本次共计存储"+i*0.8+"条数据"); }
//设置爬取深度,循环多次获取docid private static List<String> getIds(String url1,int num,String type) { List<String> id = new ArrayList<>(); List<String> ids = new ArrayList<>(); for (int i=0;i<=num;i+=10){ id = getDocid(url1,i,type); ids.addAll(id); } return ids; }
public static void main(String[] args) throws Exception { //爬取条数,10的倍数,网易新闻每10条预留大约2个广告位,所以爬取新闻的真实条数大约为80% int deep = 30; //爬取宽度,0:首页,1:社会,2:国内,3:国际,4:历史 int width = 1; //网易新闻类型 String[] typeArray={"BBM54PGAwangning","BCR1UC1Qwangning","BD29LPUBwangning","BD29MJTVwangning","C275ML7Gwangning"}; String type = typeArray[width]; //网易新闻列表url String url1 = "http://3g.163.com/touch/reconstruct/article/list/"; //网易新闻内容url String url2 = "http://3g.163.com/news/article/"; List<String> ids = new ArrayList<>(); //根据url1,爬取条数,新闻类型获取新闻docid ids = getIds(url1,deep,type); //根据url2,新闻docid获取内容并存储到MongoDB getContent(url2,ids); }
为了方便存取比较大的数据量,使用了mongodb数据库进行存储
列表
内容