Learning Python web scraping: crawling a lottery-results site with the BeautifulSoup library, modularized
Goal: use Python to crawl all the information on the lottery-results site http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html and save it both as a txt file and as an Excel file.
Environment: Python 3.7
Libraries to install manually: BeautifulSoup, xlwt, and lxml (the code below passes 'lxml' to BeautifulSoup as its parser, so lxml must be installed as well)
Built into the standard library, no installation needed: urllib and re
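The third-party packages can typically be installed with pip (the exact command may vary with your environment):

pip install beautifulsoup4 xlwt lxml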
The target site:
Step 1: open http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html in a browser and look over the site. Note that there are 118 pages of data to crawl in total.
Step 2: view the page source to get familiar with the page structure, the tags, and so on.
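As a quick aid for this step, here is a minimal sketch (not part of the final program) that fetches the first page and prints one result row; it assumes, as the full code below does, that each draw sits in a <tr> whose <em> tags hold the ball numbers:

import urllib.request
from bs4 import BeautifulSoup

# fetch the first list page and decode it the same way the main program does
html = urllib.request.urlopen(
    'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html').read().decode('utf-8')
soup = BeautifulSoup(html, 'lxml')
for tr in soup.find_all('tr'):
    if tr.find('em'):  # only result rows contain <em> tags
        print([td.get_text(strip=True) for td in tr.find_all('td')])
        break  # one row is enough to confirm the structure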
Approach:
The structure of a crawler program:
1. Scheduler module: sets the strategy for issuing network requests
2. Network module: issues the requests and receives the server's responses
3. Spider module: parses the responses and extracts the data
4. Item module: defines the data items to be scraped
5. Pipelines module: post-processes the scraped data (write it to a database, write it to the file system, hand it to a stream-processing framework, and so on)
The sample program below implements each of these modules.
Sample code:
getWinningNum.py
#encoding=utf-8
import re
import urllib.request
from urllib.error import URLError  # needed for the except clause in getResponseContent
from bs4 import BeautifulSoup
from save2excel import SavaBallDate

# 4. Item module: defines the data items to be scraped
class DoubleColorBallItem(object):
    date = None
    order = None
    red1 = None
    red2 = None
    red3 = None
    red4 = None
    red5 = None
    red6 = None
    blue = None
    money = None
    firstPrize = None
    secondPrize = None

class GetDoubleColorBallNumber(object):
    def __init__(self):
        self.urls = []
        self.urls = self.getUrls()
        self.items = self.spider(self.urls)
        self.pipelines(self.items)
        SavaBallDate(self.items)

    # builds the list of page URLs to crawl
    def getUrls(self):
        URL = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
        htmlContent = self.getResponseContent(URL)
        soup = BeautifulSoup(htmlContent, 'lxml')
        # the total page count sits in the <strong> tag of the last <p> on the page
        tag = soup.find_all('p')[-1]
        pages = tag.strong.get_text()
        for i in range(1, int(pages) + 1):
            url = r'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(i) + '.html'
            self.urls.append(url)
        return self.urls

    # 2. Network module: issues the request and receives the server's response
    def getResponseContent(self, url):
        try:
            response = urllib.request.urlopen(url)
        except URLError as e:
            raise e
        else:
            return response.read().decode('utf-8')

    # 3. Spider module: parses the pages and extracts the data
    def spider(self, urls):
        items = []
        for url in urls:
            try:
                htmlContent = self.getResponseContent(url)
                soup = BeautifulSoup(htmlContent, 'lxml')
                tags = soup.find_all('tr')
                for tag in tags:
                    # only result rows contain <em> tags (the ball numbers)
                    if tag.find('em'):
                        item = DoubleColorBallItem()
                        tagTd = tag.find_all('td')
                        item.date = tagTd[0].get_text()
                        item.order = tagTd[1].get_text()
                        tagEm = tagTd[2].find_all('em')
                        item.red1 = tagEm[0].get_text()
                        item.red2 = tagEm[1].get_text()
                        item.red3 = tagEm[2].get_text()
                        item.red4 = tagEm[3].get_text()
                        item.red5 = tagEm[4].get_text()
                        item.red6 = tagEm[5].get_text()
                        item.blue = tagEm[6].get_text()
                        item.money = tagTd[3].find('strong').get_text()
                        item.firstPrize = tagTd[4].find('strong').get_text()
                        item.secondPrize = tagTd[5].find('strong').get_text()
                        items.append(item)
            except Exception as e:
                raise e  # print(str(e))
        return items

    # 5. Pipelines module: post-processes the scraped data (write it to a
    # database, write it to the file system, hand it to a stream-processing
    # framework, and so on)
    def pipelines(self, items):
        fileName = u'双色球.txt'
        # 'w' overwrites an existing file; use 'a' to append instead
        with open(fileName, 'w', encoding='utf-8') as fp:
            for item in items:
                fp.write('%s %s \t %s %s %s %s %s %s %s \t %s \t %s %s \n'
                         % (item.date, item.order, item.red1, item.red2,
                            item.red3, item.red4, item.red5, item.red6,
                            item.blue, item.money, item.firstPrize,
                            item.secondPrize))

if __name__ == '__main__':
    GDCBN = GetDoubleColorBallNumber()
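Note that the program above issues all 118 requests back to back, so the scheduling strategy (module 1) is simply "crawl every page immediately". A gentler strategy, sketched here as a hypothetical helper (fetch_politely and its delay parameter are not part of the original code), would pause between requests:

import time
import urllib.request

def fetch_politely(urls, delay=1.0):
    # hypothetical helper: fetch each page with a pause in between,
    # so the requests do not hammer the server
    for url in urls:
        yield urllib.request.urlopen(url).read().decode('utf-8')
        time.sleep(delay)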
save2excel.py
#encoding=utf-8
import xlwt

class SavaBallDate(object):
    def __init__(self, items):
        self.items = items
        self.run(self.items)

    def run(self, items):
        fileName = u'双色球.xls'
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('ball', cell_overwrite_ok=True)
        # header row
        sheet.write(0, 0, u'开奖日期')
        sheet.write(0, 1, u'期号')
        sheet.write(0, 2, u'红1')
        sheet.write(0, 3, u'红2')
        sheet.write(0, 4, u'红3')
        sheet.write(0, 5, u'红4')
        sheet.write(0, 6, u'红5')
        sheet.write(0, 7, u'红6')
        sheet.write(0, 8, u'蓝')
        sheet.write(0, 9, u'销售金额')
        sheet.write(0, 10, u'一等奖')
        sheet.write(0, 11, u'二等奖')
        # one data row per draw, starting under the header
        for i, item in enumerate(items, start=1):
            sheet.write(i, 0, item.date)
            sheet.write(i, 1, item.order)
            sheet.write(i, 2, item.red1)
            sheet.write(i, 3, item.red2)
            sheet.write(i, 4, item.red3)
            sheet.write(i, 5, item.red4)
            sheet.write(i, 6, item.red5)
            sheet.write(i, 7, item.red6)
            sheet.write(i, 8, item.blue)
            sheet.write(i, 9, item.money)
            sheet.write(i, 10, item.firstPrize)
            sheet.write(i, 11, item.secondPrize)
        book.save(fileName)

if __name__ == '__main__':
    pass
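To spot-check the generated workbook, one option (assuming the xlrd package is installed; the article itself does not use it) is:

import xlrd

book = xlrd.open_workbook(u'双色球.xls')
sheet = book.sheet_by_name('ball')
print(sheet.nrows, 'rows; header row:', sheet.row_values(0))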
Results:
The data set is fairly large, so the run may take a while; below is the project folder after the program finishes.
The __pycache__ folder is generated automatically when the program runs and can be ignored.