Scraping Lagou job postings with Selenium and visualizing the results
"""
Created by Young on 2019/1/23 10:25
"""
import pymongo
from selenium import webdriver
from lxml import etree
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import re
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
url = 'https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB?oquery=python%E5%90%8E%E7%AB%AF&fromSearch=true&labelWords=relative'
client = pymongo.MongoClient('localhost', 27017)
lagou = client['lagou']
job_info = lagou['lagou_job']  # collection holding one document per job posting
def job_link(url):
    """Walk through every page of search results, handing each page's HTML to page_list()."""
    driver.get(url)
    while True:
        time.sleep(2)  # crude wait for the job list to render before reading the DOM
        source = driver.page_source
        page_list(source)
        next_btn = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#s_position_list > div.item_con_pager > div > span.pager_next')))
        # Lagou marks the last page by adding pager_next_disabled to the button's class.
        if 'pager_next_disabled' in next_btn.get_attribute('class'):
            break
        next_btn.click()
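# The fixed sleeps are the fragile part of this crawl. A sturdier option is an
# explicit wait on the result list itself before reading page_source; the item
# selector below is an assumption about Lagou's current markup and may need
# adjusting:
#
#     wait.until(EC.presence_of_element_located(
#         (By.CSS_SELECTOR, '#s_position_list .con_list_item')))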
def page_list(source):
    """Pull every job-detail link off a result page and visit each one."""
    html = etree.HTML(source)
    links = html.xpath('//a[@class="position_link"]/@href')
    for link in links:
        request_detail(link)
def request_detail(url):
    """Open a job-detail page in a new tab, parse it, then return to the list tab."""
    driver.execute_script("window.open('%s')" % url)
    driver.switch_to.window(driver.window_handles[1])
    time.sleep(2)  # let the detail page render before reading it
    source = driver.page_source
    job_detail(source)
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
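# With only two tabs open, window_handles[1] is the freshly opened detail tab
# and window_handles[0] the original results tab; driver.close() closes only
# the current tab, which is why the script switches back explicitly.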
def job_detail(source):
    """Extract the fields of interest from a job-detail page and store them."""
    try:
        html = etree.HTML(source)
        company = html.xpath('//div[@class="company"]/text()')[0]
        position_name = html.xpath('//span[@class="name"]/text()')[0]
        # Salary, city, experience, education, and job type all sit in spans
        # under dd.job_request, separated by "/" characters stripped below.
        job_request_span = html.xpath('//dd[@class="job_request"]//span')
        salary = job_request_span[0].xpath('.//text()')[0].strip()
        address = re.sub(r'[\s/]', '', job_request_span[1].xpath('.//text()')[0])
        work_years = re.sub(r'[\s/]', '', job_request_span[2].xpath('.//text()')[0])
        education = re.sub(r'[\s/]', '', job_request_span[3].xpath('.//text()')[0])
        work_time = re.sub(r'[\s/]', '', job_request_span[4].xpath('.//text()')[0])
        advantage = html.xpath('//dd[@class="job-advantage"]/p/text()')
        desc = ''.join(html.xpath('//dd[@class="job_bt"]//text()')).strip()
        desc = re.sub(r"职位描述:\n \n ", '', desc)  # drop the page's boilerplate heading
        data = {
            'company': company,
            'position': position_name,
            'salary': salary,
            'city': address,
            'experience': work_years,
            'education': education,
            'job_type': work_time,
            'perks': advantage,
            'description': desc,
        }
        print(data)
        print('**' * 30)
        save_to_mongo(data)
    except (TimeoutException, IndexError):
        print('detail page timed out or was missing expected fields')
def save_to_mongo(result):
    """Store one parsed posting; insert() is gone from modern pymongo, so use insert_one()."""
    if job_info.insert_one(result):
        print('----------------------------- saved to MongoDB -------------------------------------\n', result)
        return True
    return False
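# Re-running the crawl will insert duplicate documents. One hedge, assuming
# 'company' plus 'position' identifies a posting well enough, is an upsert
# instead of a plain insert:
#
#     job_info.update_one(
#         {'company': result['company'], 'position': result['position']},
#         {'$set': result}, upsert=True)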
def main():
    job_link(url)

if __name__ == '__main__':
    main()
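The title promises a visualization step that the crawler above stops short of. Below is a minimal sketch of that analysis, assuming the data was stored with the field names used in job_detail() and that pandas and matplotlib are installed; the salary parser is a simplification that takes the midpoint of ranges like "15k-25k".

import re

import matplotlib.pyplot as plt
import pandas as pd
import pymongo

client = pymongo.MongoClient('localhost', 27017)
records = list(client['lagou']['lagou_job'].find({}, {'_id': 0}))
df = pd.DataFrame(records)

def salary_mid(s):
    """'15k-25k' -> 20.0; returns None when no number is present."""
    nums = [int(n) for n in re.findall(r'\d+', s)]
    return sum(nums) / len(nums) if nums else None

df['salary_k'] = df['salary'].apply(salary_mid)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
df['salary_k'].dropna().plot.hist(bins=15, ax=ax1)
ax1.set_xlabel('monthly salary (k RMB)')
ax1.set_title('Salary distribution')
df['education'].value_counts().plot.bar(ax=ax2)
ax2.set_title('Required education')
plt.tight_layout()
plt.show()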