走过路过的大神帮忙看看:BS4爬取电影天堂的数据只有一半
问题:用 BeautifulSoup(BS4)爬取电影天堂的最新电影列表时,保存下来的结果只有当前页实际数据的一半。
跪求大神帮忙指出问题出在哪里。
代码如下:
from bs4 import BeautifulSoup
import requests
import time
import csv
def get_Html(url_f):
    """Scrape one movie listing page and store one CSV row per movie link.

    Every anchor with an ``href`` inside a ``div.co_content8`` container on
    the listing page is treated as a movie entry. Its detail page is fetched
    and the text of the anchors found inside
    ``<td style="WORD-WRAP: break-word">`` cells is read as the download link.
    Each entry is handed to ``save_result`` as a dict with the keys
    ``name``, ``link`` and ``link3``.

    NOTE(review): the pasted original had lost its indentation, so it is not
    certain at which loop level the save call was meant to run. This version
    saves exactly one row per anchor, so an entry is never dropped because
    its detail page has no matching cell or could not be fetched; ``link3``
    is then left empty.

    Args:
        url_f: Absolute URL of the listing page to scrape.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched. Failures on individual detail pages are reported with
            ``print`` and do not abort the run.
    """
    # Function-scope import so this block does not depend on file-level edits.
    from urllib.parse import urljoin

    # 1. Fetch the listing page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Referer': 'https://www.dytt8.net/html/gndy/dyzz/index.html',
    }
    req = requests.get(url_f, headers=headers, timeout=10)

    # 2. Parse the listing. A dedicated name is used so that parsing a detail
    #    page further down never rebinds the listing soup.
    list_soup = BeautifulSoup(req.content, 'lxml')

    for box in list_soup.find_all('div', class_="co_content8"):
        # href=True skips anchors without a target instead of raising KeyError.
        for a in box.find_all('a', href=True):
            moviename = a.get_text()
            # Resolve against the page that was actually fetched; a hard-coded
            # host sends relative links to the wrong site whenever the listing
            # comes from a different domain.
            url_1 = urljoin(url_f, a['href'])
            print(moviename)
            print(url_1)

            # 3. Detail page: look for the download link.
            url_3 = None  # reset per movie so no value leaks from the previous one
            try:
                req2 = requests.get(url_1, headers=headers, timeout=10)
            except requests.RequestException as exc:
                print('failed to fetch detail page:', url_1, exc)
            else:
                detail_soup = BeautifulSoup(req2.content, 'html.parser')
                for td in detail_soup.find_all(
                        'td', attrs={'style': 'WORD-WRAP: break-word'}):
                    for url_2 in td.find_all('a'):
                        # As in the original, the last anchor text found wins.
                        url_3 = url_2.string
                        print(url_3)

            # 4. Store the result for this movie.
            save_result({
                "name": moviename,
                "link": url_1,
                "link3": url_3,
            })
#存储
def save_result(item):
    """Append one scraped record to ``dy.csv`` in the working directory.

    Args:
        item: Mapping with the keys ``name``, ``link`` and ``link3``.
            ``None`` values are written as empty cells.

    Raises:
        ValueError: If ``item`` contains a key outside the three columns
            (raised by ``csv.DictWriter``).
    """
    # newline='' lets the csv module control line endings itself.
    with open('dy.csv', 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['name', 'link', 'link3']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow(item)
def main():
    """Prepare ``dy.csv`` and scrape every configured listing page.

    The header row is written only when the file is still empty, so repeated
    runs append data without inserting another header line. The file is
    opened as UTF-8, the same encoding ``save_result`` uses for the rows.
    """
    with open('dy.csv', 'a', newline='', encoding='utf-8') as csvfile:
        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file is new or empty.
        if csvfile.tell() == 0:
            csv.writer(csvfile).writerow(['name', 'link', 'link3'])

    # Alternative paging scheme kept for reference: format the page number
    # into 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html' for
    # i in range(1, 3) and pass each resulting URL to get_Html.

    # Paging scheme in use: an explicit list of listing pages.
    urls = [
        'https://www.dy2018.com/5/index.html',
        'https://www.dy2018.com/5/index_2.html',
    ]
    for url_f in urls:
        get_Html(url_f)
        time.sleep(2)  # pause between listing pages
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()