urllib及re模块爬取51job的职位信息
https://search.51job.com/list/060000,000000,0000,00,9,99,python,2,3.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
这是要爬取的网址
需要爬取的内容包括:职位名、公司名、工作地点、薪资和发布时间。不多说,直接开始写代码。
import random
import re
import urllib.request

import xlwt
# Fetch the HTML of the search-result page.

# Default page to scrape. The query string must stay pure ASCII: the
# "&degreefrom=99" parameter is spelled out explicitly because an HTML-entity
# decoder turns "&deg" into the degree sign, which breaks the request.
_DEFAULT_SEARCH_URL = (
    'https://search.51job.com/list/060000,000000,0000,00,9,99,python,2,1.html?l'
    'ang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&'
    'companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate'
    '=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
)

# Candidate HTTP proxies; one is picked at random for every call.
_PROXIES = ["121.31.159.231:8123", "110.73.10.193:8123", "110.73.10.204:8123"]


def getHtml(url=_DEFAULT_SEARCH_URL):
    """Download a page and return its body decoded as GBK text.

    Args:
        url: Address to fetch. Defaults to the 51job search-result page
            used by this script.

    Returns:
        The response body as a ``str``, decoded with the ``gbk`` codec.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    req = urllib.request.Request(url)

    # NOTE(review): the proxy is registered for the "http" scheme only, so an
    # "https" URL (such as the default) is not routed through it. Left as in
    # the original; confirm whether an "https" entry is wanted.
    proxy_handler = urllib.request.ProxyHandler({"http": random.choice(_PROXIES)})
    opener = urllib.request.build_opener(proxy_handler)
    # Installs the opener globally, so later urlopen() calls use it as well.
    urllib.request.install_opener(opener)

    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('gbk')


# Next step: extract the wanted fields from the downloaded page.
# Compiled once at import time instead of on every call. re.S lets ".*?"
# span line breaks, since the fields of one listing row are spread over
# several lines of HTML. Five capture groups, in order: the title attribute
# of the link inside the "t1" element, the title attribute of the link inside
# the "t2" span, and the text of the "t3", "t4" and "t5" spans.
_JOB_ROW_PATTERN = re.compile(
    r'class="t1 ">.*?<a target="_blank" title="(.*?)".*? '
    r'<span class="t2"><a target="_blank" title="(.*?)".*? '
    r'<span class="t3">(.*?)</span>.*?'
    r'<span class="t4">(.*?)</span>.*?'
    r'<span class="t5">(.*?)</span>',
    re.S,
)


def getdata(html):
    """Extract every job listing row from a search-result page.

    Args:
        html: Page source as a ``str``.

    Returns:
        A list of 5-tuples of strings, one per matched row, holding the five
        captured fields in pattern order. The list is empty when nothing
        matches.
    """
    return _JOB_ROW_PATTERN.findall(html)
# Module-level accumulator: one 5-element list per scraped job listing.
# Filled by saveDataList() and read by saveExcel().
dataList = []


def saveDataList():
    """Download the search page and append every parsed row to ``dataList``.

    Each match returned by ``getdata`` is copied into a new list and appended
    to the module-level ``dataList``. Calling this more than once appends
    again; the list is never cleared here.
    """
    html = getHtml()
    for item in getdata(html):
        # Copy the five captured fields into a list of their own.
        dataList.append(list(item[:5]))


def saveExcel(path):
    """Write the header row and every row of ``dataList`` to an Excel file.

    Args:
        path: Destination file name passed to ``Workbook.save``.
    """
    # Create the workbook and its single sheet.
    book = xlwt.Workbook()
    sheet = book.add_sheet('51job职位信息')

    # Row 0 is the header.
    col = [u'职位名', u'公司名', u'工作地点', u'薪资', u'发布时间']
    for column, title in enumerate(col):
        sheet.write(0, column, title)

    # Data rows start at row 1, directly below the header.
    for row, data in enumerate(dataList, start=1):
        for column, value in enumerate(data[:len(col)]):
            sheet.write(row, column, value)

    book.save(path)
# Run the scrape: collect the listings into dataList, then write them to
# '51job.xls' in the current working directory.
saveDataList()
saveExcel('51job.xls')