Python Crawler 3: Scraping All Job Postings from Tencent Recruitment
Python Crawler 2 already had an initial version of the code; since then it has been optimized.
The scraper now also captures each position's job responsibilities and job requirements.
The data collected per position: title (zname), category (ztype), number of openings (znum), location (zlocal), publish date (ztime), detail-page link (detailLink), job responsibilities (jobRes), and job requirements (jobReq).
The code is as follows:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import json  # results are stored as JSON

def tengxun(detail, num):
    url = 'https://hr.tencent.com/'
    # detail = 'position.php?&start=0#a'
    request = urllib2.Request(url + detail)
    response = urllib2.urlopen(request)
    resHtml = response.read()
    soup = BeautifulSoup(resHtml, 'html.parser', from_encoding='utf-8')
    # the listing rows alternate between the .even and .odd classes
    result = soup.select(".even")
    result += soup.select(".odd")
    items = []
    for node in result:
        item = {}
        # position title
        zname = node.select('td')[0].get_text()
        # position category
        ztype = node.select('td')[1].get_text()
        # number of openings
        znum = node.select('td')[2].get_text()
        # location
        zlocal = node.select('td')[3].get_text()
        # publish date
        ztime = node.select('td')[4].get_text()
        # link to the detail page
        detailLink = node.select('td a')[0].attrs['href']
        # fetch the detail page to get responsibilities and requirements
        request1 = urllib2.Request(url + detailLink)
        response1 = urllib2.urlopen(request1)
        jobHtml = response1.read()
        soup1 = BeautifulSoup(jobHtml, 'html.parser', from_encoding='utf-8')
        # job responsibilities: first ul.squareli block
        jobRes = ''
        for li in soup1.select('ul.squareli')[0].select('li'):
            jobRes += li.get_text() + '\n'
        # job requirements: second ul.squareli block
        jobReq = ''
        for li in soup1.select('ul.squareli')[1].select('li'):
            jobReq += li.get_text() + '\n'
        # collect the fields into one record
        item['zname'] = zname
        item['detailLink'] = detailLink
        item['ztype'] = ztype
        item['znum'] = znum
        item['zlocal'] = zlocal
        item['ztime'] = ztime
        item['jobRes'] = jobRes
        item['jobReq'] = jobReq
        items.append(item)
    print(len(items))
    # dump as JSON, one file per page; disable ASCII escaping so the
    # output is written as UTF-8
    output = open('tencent' + str(num) + '.json', 'w')
    line = json.dumps(items, ensure_ascii=False)
    output.write(line.encode('utf-8'))
    output.close()

# each listing page shows 10 positions; 303 pages cover the whole site
for i in range(303):
    print("processing page " + str(i))
    url = 'position.php?&start=' + str(i * 10) + '#a'
    tengxun(url, i)
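Since the script writes one JSON file per page, the 303 per-page files can be merged afterwards. Below is a minimal sketch of that step, not part of the original script: it assumes the tencent0.json through tencent302.json files produced by the run above, and the output name tencent_all.json is my own choice.

# -*- coding:utf-8 -*-
import json

merged = []
for i in range(303):
    # read each per-page file back; the files were written as UTF-8
    f = open('tencent' + str(i) + '.json', 'r')
    merged += json.loads(f.read().decode('utf-8'))
    f.close()

# write everything into a single JSON file (file name is illustrative)
output = open('tencent_all.json', 'w')
output.write(json.dumps(merged, ensure_ascii=False).encode('utf-8'))
output.close()
print(len(merged))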
The JSON data extracted looks like this:
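For illustration only, each record in the output files has the following shape; the values below are made up, not real scraped data.

[
    {
        "zname": "Example position title",
        "detailLink": "position_detail.php?id=...",
        "ztype": "Technical",
        "znum": "2",
        "zlocal": "Shenzhen",
        "ztime": "2018-08-10",
        "jobRes": "First responsibility\nSecond responsibility\n",
        "jobReq": "First requirement\nSecond requirement\n"
    }
]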