Crawler Learning - Task 4
- Hands-on project: simulate logging in to DXY (丁香园), then scrape every poster's basic information and the content of their replies from a forum thread page
# Simulate logging in to DXY (丁香园)
import time
import requests
from selenium import webdriver
from lxml import etree

def login():
    browser = webdriver.Chrome()
    browser.get('https://auth.dxy.cn/accounts/login')
    # Click to switch back to the desktop login form
    browser.find_element_by_xpath('/html/body/div[2]/div[2]/div[1]/a[2]').click()
    input_name = browser.find_element_by_name('username')      # locate the username input box
    input_name.send_keys('Tinkle_sunshine')                    # enter your own username
    input_password = browser.find_element_by_name('password')  # locate the password input box
    input_password.send_keys('12346')                          # enter your own email password
    browser.find_element_by_xpath('//*[@id="user"]/div[1]/div[3]/button').click()  # click the login button
    time.sleep(4)                                               # wait for the login to complete
    cookies = browser.get_cookies()
    cookie_dict = {i['name']: i['value'] for i in cookies}     # keep only name -> value pairs
    return cookie_dict
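
# Optional: instead of the fixed time.sleep(4) above, an explicit wait is usually
# more reliable. A minimal sketch, assuming a successful login redirects the
# browser away from the auth.dxy.cn login URL (wait_for_login is an assumed
# helper, not part of the original script):
def wait_for_login(browser, timeout=10):
    from selenium.webdriver.support.ui import WebDriverWait
    # Block until the current URL no longer contains 'login', or raise a timeout.
    WebDriverWait(browser, timeout).until(lambda d: 'login' not in d.current_url)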

def get_contents():
    cookies = login()
    headers = {'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) "}
    response = requests.get(url='http://www.dxy.cn/bbs/thread/626626#626626', headers=headers, cookies=cookies)
    html = etree.HTML(response.text)
    users = html.xpath('//div[@class="auth"]')        # poster name nodes
    contents = html.xpath('//td[@class="postbody"]')  # post body nodes
    # Open the output file once before the loop so every post is written,
    # rather than reopening it in write mode and keeping only the last post.
    write_file = open('contents.txt', 'w', encoding="utf-8")
    for i in range(len(users)):
        user = users[i].xpath('string(.)')
        content = contents[i].xpath('string(.)').strip()
        print(user + ':')
        print(content)
        print('——' * 100)
        result = user + ':' + content
        write_file.write(result + "\n")
        write_file.write('-' * 80 + "\n")
    write_file.close()
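
# Optional: the cookie dict returned by login() can also be loaded into a
# requests.Session, so several pages can be fetched with one authenticated
# session. A minimal sketch (make_session is an assumed helper, not part of
# the original script):
def make_session(cookie_dict):
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    session.cookies.update(cookie_dict)  # copy the name -> value pairs captured by Selenium
    return session
# usage sketch: session = make_session(login()); session.get('http://www.dxy.cn/bbs/thread/626626')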

if __name__ == '__main__':
    get_contents()
Result: