xpath爬个扇贝单词
注意的是:
// 是全局查找
.// 是当前节点标签内全局查找
提示:
有多少页单词就能爬多少页,就是用下面这几行代码跳出死循环的,也就是说,捕获不到数据就说明页码到头了。
tr_list = tree.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
if not tr_list:
break
import requests
from lxml import etree
def getData(list):
    """Return the first item of ``list``, or ``''`` when there is nothing to take.

    The script uses this to unwrap XPath query results, which arrive as a
    possibly empty list.

    Args:
        list: Sequence of results. ``None`` and empty sequences are accepted.
            The name shadows the builtin; it is kept so that existing
            keyword callers keep working.

    Returns:
        ``list[0]`` when the sequence is non-empty, otherwise the empty string.
    """
    # Truthiness covers None as well as empty sequences, so a missing
    # result yields '' instead of raising.
    if not list:
        return ''
    return list[0]
# Example URLs of the word list being scraped (first page, then page 2):
#   https://www.shanbay.com/wordlist/110521/232414/
#   https://www.shanbay.com/wordlist/110521/232414/?page=2
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}

# Walk the pages one by one; the loop ends at the first page that yields
# no table rows.
page = 1
while True:
    url = 'https://www.shanbay.com/wordlist/110521/232414/?page={}'.format(page)
    page += 1

    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, headers=headers, timeout=10)

    # Dump the most recently fetched page for inspection (overwritten on every
    # iteration). response.encoding can be None when the server declares no
    # charset; fall back to UTF-8 rather than the platform default codec.
    with open('word.html', 'w', encoding=response.encoding or 'utf-8') as fp:
        fp.write(response.text)

    tree = etree.HTML(response.text)

    # A leading `//` searches the whole document.
    tr_list = tree.xpath('//table[@class="table table-bordered table-striped"]/tbody/tr')
    if not tr_list:
        # No rows captured: treat this as having run past the last page.
        break

    for tr in tr_list:
        # A leading `.//` searches only inside the current <tr> node.
        word_list = tr.xpath('.//strong/text()')
        word = getData(word_list)
        meaning_list = tr.xpath('.//td[@class="span10"]/text()')
        meaning = getData(meaning_list)
        print(word, meaning)