BS4的解析次序
htm2 = requests.get(url,headers)
soup = BeautifulSoup(htm2.content,'lxml')
books_lst = []
name = soup.find('div',class_='listmain')
#父div,子dl,孙dt(总标题)和dd(每一章)
if name:
dd_items = name.find('dl')
dt_num = 0
for n in dd_items.children:
ename = str(n.name).strip()
if ename == 'dt':
dt_num += 1
if ename != 'dd':
continue
books_info = {}
if dt_num == 2:
durls = n.find_all('a')[0]
books_info['name'] = durls.get_text()
books_info['url'] = 'http://www.biqukan.com' + durls.get('href')
books_lst.append(books_info)
return books_lst