Crawler Learning - Task 4
- Hands-on project: simulate logging in to DXY (丁香园), then scrape every poster's basic information and the content of their replies from a forum thread page
# Simulate logging in to DXY (丁香园)
import time
import requests
from selenium import webdriver
from lxml import etree

def login():
    browser = webdriver.Chrome()
    browser.get('https://auth.dxy.cn/accounts/login')
    # Click to switch back to the desktop login form
    browser.find_element_by_xpath('/html/body/div[2]/div[2]/div[1]/a[2]').click()
    input_name = browser.find_element_by_name('username')      # locate the username input box
    input_name.send_keys('Tinkle_sunshine')                    # enter your own username
    input_password = browser.find_element_by_name('password')  # locate the password input box
    input_password.send_keys('12346')                          # enter your own email password
    browser.find_element_by_xpath('//*[@id="user"]/div[1]/div[3]/button').click()  # click the login button
    time.sleep(4)                                               # wait for the login to complete
    cookies = browser.get_cookies()
    cookie_dict = {i['name']: i['value'] for i in cookies}     # keep only name -> value pairs
    return cookie_dict
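
# Optional: instead of the fixed time.sleep(4) above, an explicit wait is usually
# more reliable. A minimal sketch, assuming a successful login redirects the
# browser away from the auth.dxy.cn login URL (wait_for_login is an assumed
# helper, not part of the original script):
def wait_for_login(browser, timeout=10):
    from selenium.webdriver.support.ui import WebDriverWait
    # Block until the current URL no longer contains 'login', or raise a timeout.
    WebDriverWait(browser, timeout).until(lambda d: 'login' not in d.current_url)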

def get_contents():
    cookies = login()
    headers = {'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) "}
    response = requests.get(url='http://www.dxy.cn/bbs/thread/626626#626626', headers=headers, cookies=cookies)
    html = etree.HTML(response.text)
    users = html.xpath('//div[@class="auth"]')        # poster name nodes
    contents = html.xpath('//td[@class="postbody"]')  # post body nodes
    # Open the output file once before the loop so every post is written,
    # rather than reopening it in write mode and keeping only the last post.
    write_file = open('contents.txt', 'w', encoding="utf-8")
    for i in range(len(users)):
        user = users[i].xpath('string(.)')
        content = contents[i].xpath('string(.)').strip()
        print(user + ':')
        print(content)
        print('——' * 100)
        result = user + ':' + content
        write_file.write(result + "\n")
        write_file.write('-' * 80 + "\n")
    write_file.close()
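
# Optional: the cookie dict returned by login() can also be loaded into a
# requests.Session, so several pages can be fetched with one authenticated
# session. A minimal sketch (make_session is an assumed helper, not part of
# the original script):
def make_session(cookie_dict):
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    session.cookies.update(cookie_dict)  # copy the name -> value pairs captured by Selenium
    return session
# usage sketch: session = make_session(login()); session.get('http://www.dxy.cn/bbs/thread/626626')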

if __name__ == '__main__':
    get_contents()
Result: