Python 爬虫:爬取 GitHub 项目里的评论

这几天因为实验需要,我爬取了 GitHub 上 bitcoin/bitcoin 项目中处于 open 状态的 issue 及其评论信息。现在贴出源码:

import urllib.request
import re
from bs4 import BeautifulSoup
import io
import sys
import openpyxl

# Module-level accumulator: gettitle() appends one list per scraped issue,
# and the __main__ block passes the whole list to write07Excel().
record=[]

def gettitle(page=1):
    """Scrape one listing page of open bitcoin/bitcoin issues into ``record``.

    For each issue found on the page, a five-element list is appended to
    the module-level ``record``: the text of the matched link (presumably
    the issue title -- depends on GitHub's markup), the constant string
    "opened", the ``datetime`` attribute of the ``relative-time`` element,
    the label texts each followed by "/", and the text returned by
    ``getdata`` for the issue number.

    The four element selections are walked in lockstep, so iteration
    stops at the shortest one.

    Args:
        page: Page number placed in the ``page`` query parameter.

    Returns:
        None. Any exception is caught and printed, so a failing page does
        not stop the caller's loop.
    """
    try:
        #sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='UTF-8')
        url="https://github.com/bitcoin/bitcoin/issues?page="+str(page)+"&q=is%3Aopen+is%3Aissue"
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('UTF-8')
        soup = BeautifulSoup(html, 'lxml')
        title_links = soup.select('li > div > div > a')
        opened_by = soup.select('span.opened-by')
        timestamps = soup.select('relative-time')
        containers = soup.select('div.float-left.col-9.lh-condensed.p-2')
        for link, opener, stamp, container in zip(
                title_links, opened_by, timestamps, containers):
            labels = "".join(
                tag.get_text() + '/'
                for tag in container.select('a.d-inline-block.IssueLabel.v-align-text-top')
            )
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            number = re.search(r'\d+', opener.get_text())
            # Without an issue number there is nothing to fetch; keep the row
            # with empty comment text instead of aborting the whole page.
            comments = getdata(number.group(0)) if number else ""
            record.append([
                link.get_text(),
                "opened",
                stamp.attrs['datetime'],
                labels,
                comments,
            ])
        print('hosts刷新成功:',len(title_links))
    except Exception as err:
        print(str(err))

def getdata(sn):
    """Fetch one bitcoin/bitcoin issue page and return its table-cell text.

    Args:
        sn: Issue number; converted with ``str()`` and appended to the
            issues URL.

    Returns:
        The text of every element matching ``table > tbody > tr > td``,
        each followed by a line feed and a carriage return, concatenated
        into one string. An empty string is returned when the request or
        the parsing raises; the error is printed rather than propagated.
    """
    value=""
    try:
        url="https://github.com/bitcoin/bitcoin/issues/"+str(sn)
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('UTF-8')
        soup = BeautifulSoup(html, 'lxml')
        cells = soup.select('table > tbody > tr > td')
        # One join instead of repeated += on a growing string.
        value = "".join(cell.get_text() + "\n\r" for cell in cells)
    except Exception as err:
        print(str(err))
    return value

def write07Excel(path,value):
    """Write a list of rows to a new .xlsx workbook at ``path``.

    Every item of every row in ``value`` is converted with ``str()`` and
    stored in a sheet titled 'Sheet1', with the first item of the first
    row landing in row 1, column 1.

    Args:
        path: Destination file name passed to ``Workbook.save``.
        value: Sequence of rows, each a sequence of cell values.
    """
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.title = 'Sheet1'
    # Cell coordinates are 1-based, hence start=1 on both counters.
    for row_no, row_items in enumerate(value, start=1):
        for col_no, item in enumerate(row_items, start=1):
            worksheet.cell(row=row_no, column=col_no, value=str(item))
    workbook.save(path)

if __name__=="__main__":
    # Scrape listing pages 1 through 23, then write every collected row
    # to a single workbook.
    for page_no in range(1,24):
        gettitle(page_no)
        print(f"第{page_no}页抓取完成")
    write07Excel("open.xlsx",record)

爬取的数据主要包括以下几项(每个 issue 对应 Excel 中的一行,另有一列固定写入状态"opened"):

1.评论主题(issue 标题)

2.评论时间

3.评论标签(多个标签之间以"/"分隔)

4.评论内容

如下图所示:

python爬虫爬取github项目里的评论

最后将爬取的数据写入 Excel 文件(open.xlsx)中。

 

程序源码下载地址:https://github.com/XiaoYaoNet/reptile