HtmlAgilityPack爬虫实战-百度经验悬赏爬取
完整代码如下:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using Maticsoft.BLL;
using Maticsoft.Model;
using System.Net;
using System.IO;
using System.Runtime.InteropServices;
using Maticsoft.DBUtility;
namespace getXsjy_爬取悬赏经验_
{
    /// <summary>
    /// WinForms crawler for Baidu Jingyan ("百度经验") reward ("悬赏") listings.
    /// Two features: crawl the experience category list (<see cref="getMl"/>)
    /// and the per-category reward detail rows (<see cref="GetXsjy"/>),
    /// persisting results through the Maticsoft BLL layer.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        // Base listing URL; the category id is appended as ?cid=...
        string url = "https://jingyan.baidu.com/patch";
        HtmlAgilityPack.HtmlWeb webClient = new HtmlAgilityPack.HtmlWeb();
        bl_ddlbn lb_ddlb = new bl_ddlbn();
        // Items per result page, used to compute the &pn= paging offset.
        int pagesize = 15;

        private void Form1_Load(object sender, EventArgs e)
        {
            // Target pages are UTF-8; override so HtmlAgilityPack decodes them correctly.
            webClient.OverrideEncoding = Encoding.UTF8;
        }

        /// <summary>
        /// Blocks the calling thread for the given number of seconds.
        /// </summary>
        /// <param name="delayTime">Number of seconds to wait.</param>
        /// <returns>Always true once the delay has elapsed.</returns>
        public static bool Delay(int delayTime)
        {
            // BUG FIX: the original busy-waited on TimeSpan.Seconds, which is the
            // seconds *component* (0-59). That burned 100% CPU and never
            // terminated for delayTime >= 60. Sleep the thread instead.
            if (delayTime > 0)
                System.Threading.Thread.Sleep(TimeSpan.FromSeconds(delayTime));
            return true;
        }

        /// <summary>
        /// Crawls reward detail rows for every category (lb='1') and stores them
        /// as lb='2' records. Runs indefinitely: pauses during the 00:00-07:59
        /// window and sleeps a user-configured number of minutes between passes.
        /// </summary>
        private void GetXsjy()
        {
            // BUG FIX: the original called itself recursively at the end of each
            // pass and never returned, so the stack grew without bound. A loop
            // has identical per-pass behavior without the stack growth.
            while (true)
            {
                List<ddlbn> modellist = lb_ddlb.GetModelList(" lb='1'");
                lb_ddlb.Delete("2"); // clear previous detail rows before refetching
                foreach (ddlbn model in modellist)
                {
                    string ls_url = url + "?cid=" + model.dm;
                    for (int i = 0; i < int.Parse(tbx_page.Text); i++)
                    {
                        // BUG FIX: the original appended " &pn=" with a leading
                        // space, producing a malformed query string.
                        string ls_par = (pagesize * i).ToString();
                        int pn = ls_url.IndexOf("&pn=");
                        if (pn > 0)
                            ls_url = ls_url.Substring(0, pn) + "&pn=" + ls_par;
                        else
                            ls_url = ls_url + "&pn=" + ls_par;

                        List<ddlbn> pagelist = new List<ddlbn>(); // one page committed per batch
                        HtmlAgilityPack.HtmlDocument doc = webClient.Load(ls_url);
                        HtmlAgilityPack.HtmlNodeCollection colist = doc.DocumentNode.SelectNodes("//*[@class='li-par']");
                        if (colist == null || colist.Count == 0)
                            break; // no more rows for this category
                        foreach (HtmlAgilityPack.HtmlNode no in colist)
                        {
                            HtmlAgilityPack.HtmlNode row = no.ChildNodes[0].SelectSingleNode("a[@class='title query-item-id']");
                            HtmlAgilityPack.HtmlNode cash = no.ChildNodes[0].SelectSingleNode("span[@class='cash']");
                            // BUG FIX: skip rows missing either node instead of
                            // crashing with a NullReferenceException.
                            if (row == null || cash == null)
                                continue;
                            ddlbn newmodel = new ddlbn();
                            newmodel.lb = "2"; // detail record
                            newmodel.price = cash.InnerText;
                            newmodel.mc = row.InnerText;
                            newmodel.dm = row.GetAttributeValue("data-queryid", null);
                            newmodel.detail = ls_url;
                            // Replace any existing record with the same key so the
                            // batch insert below does not create a duplicate.
                            if (lb_ddlb.Exists(newmodel.dm, newmodel.lb))
                            {
                                lb_ddlb.Delete(newmodel.dm, newmodel.lb);
                            }
                            pagelist.Add(newmodel);
                        }
                        Delay(3); // throttle requests between pages
                        lb_ddlb.Add(pagelist);
                    }
                }
                // Pause overnight (00:00-07:59), re-checking once per hour.
                while (DateTime.Now.Hour >= 0 && DateTime.Now.Hour <= 7)
                {
                    System.Threading.Thread.Sleep(1000 * 60 * 60);
                }
                // Sleep the user-configured number of minutes before the next pass.
                System.Threading.Thread.Sleep(1000 * 60 * Convert.ToInt16(textBox2.Text));
            }
        }

        /// <summary>
        /// Crawls the experience category list (the #typeList element on the
        /// listing page) and stores each category as an lb='1' record.
        /// The category code is taken from the href's query-string value.
        /// </summary>
        private void getMl()
        {
            // NOTE(review): this deletes lb='2' (detail) rows while rebuilding
            // the lb='1' category rows — confirm that is intended and it should
            // not be "delete from ddlbn where lb='1'".
            DbHelperSQL.ExecuteSql("delete from ddlbn where lb='2'");
            HtmlAgilityPack.HtmlDocument doc = webClient.Load(url);
            HtmlAgilityPack.HtmlNodeCollection colist = doc.DocumentNode.SelectNodes("//*[@id='typeList']");
            // BUG FIX: SelectNodes returns null when nothing matches; bail out
            // instead of throwing in the foreach.
            if (colist == null)
                return;
            List<Maticsoft.Model.ddlbn> modellist = new List<Maticsoft.Model.ddlbn>();
            foreach (HtmlAgilityPack.HtmlNode node in colist)
            {
                HtmlAgilityPack.HtmlNodeCollection ls_a = node.SelectNodes(".//a[@href]");
                if (ls_a == null)
                    continue;
                foreach (HtmlAgilityPack.HtmlNode no in ls_a)
                {
                    ddlbn model = new ddlbn();
                    model.lb = "1"; // category record
                    model.mc = no.InnerText;
                    // Category code is the value after '=' in the href.
                    model.dm = no.GetAttributeValue("href", null).Split('=')[1];
                    modellist.Add(model);
                }
            }
            lb_ddlb.Add(modellist);
        }

        // Starts the (blocking) detail-crawl loop. NOTE(review): this never
        // returns and runs on the UI thread, freezing the form — consider a
        // BackgroundWorker or Task.Run.
        private void button1_Click(object sender, EventArgs e)
        {
            GetXsjy();
        }

        // Refreshes the category list once.
        private void button2_Click(object sender, EventArgs e)
        {
            getMl();
        }
    }
}
界面如下,主要分两个功能,经验分类目录爬取和明细数据爬取。代码比较简单,就不详细介绍了。