Notes (2) Advanced Web Scraping Techniques: Using Proxies and Anti-Crawler Mechanisms
1. Pausing the program for a while
import time
import random

time.sleep(n)  # n is the number of seconds to sleep
time_interval = random.uniform(1, 5)
time.sleep(time_interval)  # the pause can also be randomized, here anywhere between 1 and 5 seconds
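In a crawl loop, the pause goes after each request so successive hits are spaced out unpredictably. A minimal sketch (the target URL list is a placeholder, not from the original notes):

import time
import random
import urllib.request as urlrequest

urls = ['https://example.com'] * 3  # placeholder target list -- replace with real pages

for url in urls:
    html = urlrequest.urlopen(url).read()
    print('fetched {} bytes'.format(len(html)))
    time.sleep(random.uniform(1, 5))  # random 1-5 second pause between requests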
2. Scraping through a proxy
# Use urllib.request's ProxyHandler and build_opener to set up a proxy
import urllib.request as urlrequest

proxy = urlrequest.ProxyHandler({'https': '47.91.78.201:3128'})  # free proxy lists: www.sslproxies.org
opener = urlrequest.build_opener(proxy)
opener.addheaders = [('User-Agent', '...')]  # sites can tell you are crawling with Python, so the request header must masquerade as a browser
Replace '...' with the User-Agent string of a real browser.
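Before launching a long crawl it is worth confirming that traffic actually flows through the proxy. A minimal sketch, assuming the proxy above is still alive (free proxies go offline quickly) and using httpbin.org/ip as an IP-echo service:

import urllib.request as urlrequest

proxy = urlrequest.ProxyHandler({'https': '47.91.78.201:3128'})
opener = urlrequest.build_opener(proxy)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) '
                                    'AppleWebKit/603.1.30 (KHTML, like Gecko) '
                                    'Version/10.1 Safari/603.1.30')]
urlrequest.install_opener(opener)  # all later urlopen/urlretrieve calls now go through the proxy

# httpbin echoes the IP it sees; if the proxy works, this prints the proxy's IP, not yours.
print(urlrequest.urlopen('https://httpbin.org/ip').read().decode())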
3. Example
import urllib.request as urlrequest
import time
import random

IMG_PATH = './imgs/{}.jpg'
DATA_FILE = './data/votes.csv'
STORED_IMG_ID_FILE = './data/cached_img.txt'
STORED_IMG_IDS = set()
IMG_URL = 'https://maps.googleapis.com/maps/api/streetview?size=400x300&location={},{}'

# Route all requests through the proxy and masquerade as a desktop browser.
proxy = urlrequest.ProxyHandler({'https': '47.91.78.201:3128'})
opener = urlrequest.build_opener(proxy)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30')]
urlrequest.install_opener(opener)

# Load the IDs of images already downloaded so that reruns can skip them.
with open(STORED_IMG_ID_FILE) as input_file:
    for line in input_file:
        STORED_IMG_IDS.add(line.strip())

with open(DATA_FILE) as input_file:
    skip_first_line = True  # the first line of votes.csv is a header row
    for line in input_file:
        if skip_first_line:
            skip_first_line = False
            continue
        left_id, right_id, winner, left_lat, left_long, right_lat, right_long, category = line.split(',')
        if left_id not in STORED_IMG_IDS:
            print('saving img {}...'.format(left_id))
            urlrequest.urlretrieve(IMG_URL.format(left_lat, left_long), IMG_PATH.format(left_id))
            STORED_IMG_IDS.add(left_id)
            with open(STORED_IMG_ID_FILE, 'a') as output_file:
                output_file.write('{}\n'.format(left_id))
            time.sleep(1)  # wait between requests, trying to avoid being blocked by Google as a crawler
        if right_id not in STORED_IMG_IDS:
            print('saving img {}...'.format(right_id))
            urlrequest.urlretrieve(IMG_URL.format(right_lat, right_long), IMG_PATH.format(right_id))
            STORED_IMG_IDS.add(right_id)
            with open(STORED_IMG_ID_FILE, 'a') as output_file:
                output_file.write('{}\n'.format(right_id))
            time.sleep(1)  # wait between requests, trying to avoid being blocked by Google as a crawler
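Free proxies are unreliable and Google throttles aggressively, so any urlretrieve call above can fail mid-run. A minimal sketch of a retry wrapper with backoff (download_with_retry is a hypothetical helper, not part of the original notes):

import time
import random
import urllib.request as urlrequest

def download_with_retry(url, path, retries=3):
    # Try the download up to `retries` times, backing off longer after each failure.
    for attempt in range(1, retries + 1):
        try:
            urlrequest.urlretrieve(url, path)
            return True
        except OSError as err:  # urllib.error.URLError is a subclass of OSError
            print('attempt {} failed: {}'.format(attempt, err))
            time.sleep(random.uniform(2, 5) * attempt)
    return False

In the example above, the call sites would then only add an ID to STORED_IMG_IDS and the cache file when the helper returns True, so failed downloads get retried on the next run.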