爬虫任务一

【任务一】
1.1 学习get与post请求
学习get与post请求,尝试使用requests或者是urllib用get方法向https://www.baidu.com/发出一个请求,并将其返回结果输出。
如果断开了网络,再发出请求,结果又是什么。了解请求返回的状态码。
了解什么是请求头,如何添加请求头。

在客户机和服务器之间进行请求-响应时,两种最常被用到的方法是:GET 和 POST。

GET - 从指定的资源请求数据。
POST - 向指定的资源提交要被处理的数据

get方法向https://www.baidu.com/发出一个请求:
爬虫任务一
断网情况下:

Traceback (most recent call last):
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw)
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\urllib3\util\connection.py", line 57, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "H:\python3.7\lib\socket.py", line 748, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11004] getaddrinfo failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\urllib3\connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\urllib3\connectionpool.py", line 354, in _make_request
    conn.request(method, url, **httplib_request_kw)
  File "H:\python3.7\lib\http\client.py", line 1229, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "H:\python3.7\lib\http\client.py", line 1275, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "H:\python3.7\lib\http\client.py", line 1224, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "H:\python3.7\lib\http\client.py", line 1016, in _send_output
    self.send(msg)
  File "H:\python3.7\lib\http\client.py", line 956, in send
    self.connect()
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\urllib3\connection.py", line 181, in connect
    conn = self._new_conn()
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\urllib3\connection.py", line 168, in _new_conn
    self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x00000000030FF240>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\requests\adapters.py", line 449, in send
    timeout=timeout
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\urllib3\connectionpool.py", line 638, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\urllib3\util\retry.py", line 398, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='www.baidu.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000000030FF240>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:/Users/Administrator/Downloads/python-基础笔记/test/test-python.py", line 13, in <module>
    print(requests.get('http://www.baidu.com'))
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\requests\api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\requests\sessions.py", line 533, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\requests\sessions.py", line 646, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\Administrator\Downloads\sport-new\venv\lib\site-packages\requests\adapters.py", line 516, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='www.baidu.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000000030FF240>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed'))

返回状态码:
200 -请求成功、301 -资源被转移到其他url、404 -请求的资源不存在、500 -服务器的内部出错

1.2 正则表达式
学习什么是正则表达式并尝试一些正则表达式并进行匹配。

#!/usr/bin/python
# -*- coding: UTF-8 -*-


#coding=utf-8

# # 导入re模块
# import re
#
# # 使用match方法进行匹配操作
# result = re.match(正则表达式,要匹配的字符串)
#
# # 如果上一步匹配到数据的话,可以使用group方法来提取数据
# result.group()



# import re
#
# str = '<div class="nam">中国</div>'
# res = re.findall(r'<div class=".*">(.*?)</div>',str)
# print(res)


import re

# 如果hello的首字符小写,那么正则表达式需要小写的h
# ret = re.match("h","hello Python")
# print (ret.group())


# ret = re.match("[hH]","hello Python")
# print(ret.group())
# ret = re.match("[hH]","Hello Python")
# print(ret.group())

# ret = re.match("[0123456789]","7Hello Python")
# print(ret.group())

# ret = re.match("嫦娥\d号","嫦娥1号发射成功")
# print (ret.group())


# 匹配出,一个字符串第一个字母为大写字母,后面都是小写字母并且这些小写字母可有可无
# ret = re.match("[A-Z][a-z]*","Aabcdef")
# s = ret.group()
# print (s)


# 匹配出,变量名是否有效
# ret = re.match("[a-zA-Z_]+[\w_]*","_name_")
# print (ret.group())

import re

# names = ["name1", "_name", "2_name", "__name__"]
#
# for name in names:
#     ret = re.match("[a-zA-Z_]+[\w]*",name)
#     if ret:
#         print("变量名 %s 符合要求" % ret.group())
#     else:
#         print("变量名 %s 非法" % name)


# 匹配出,0到99之间的数字
# ret = re.match("[1-9]?[0-9]","90")
# print (ret.group())


# 匹配出,8到20位的密码,可以是大小写英文字母、数字、下划线
# ret = re.match("[a-zA-Z0-9_]{8}","12a3g45678")
# print(ret.group())

# ret = re.match("[a-zA-Z0-9_]{8,20}","1ad12f23s34455ff66")
# print (ret.group())

# 匹配出163的邮箱地址,且@符号之前有4到20位,例如[email protected]
# ret = re.match("[a-zA-Z0-9]{4,20}@163.com","[email protected]")
# print(ret.group())

# E_mail = "[email protected]"
# ret = re.match('[a-zA-Z0-9_]{4,20}@163.com',E_mail)
# print (ret.group())

# ret = re.match("[\w]{4,20}@163\.com", "[email protected]")
# print(ret.group())

# 通过$来确定末尾
# ret = re.match("[\w]{4,20}@163\.com$", "[email protected]")
# print(ret.group())

# \b 用法 匹配一个单词的边界
# ret = re.match(r".*\bver\b", "ho ver abc").group()
# print(ret)


# \B 	匹配非单词边界
# ret = re.match(r".*\Bver\B", "hoverabc").group()
# print(ret)

# 匹配出0-100之间的数字
# ret = re.match("[1-9]?\d","178")
# print (ret.group())

# 添加|
# ret = re.match("[1-9]?\d$|100","80")
# print (ret.group())


# 匹配出163、126、qq邮箱之间的数字
# ret = re.match("\w{4,20}@(163|126|qq)\.com", "[email protected]")
# print(ret.group())


# 练习:
# ret = re.match("([^-]*)-(\d+)","010-12345678")
# print(ret.group())
# print(ret.group(1))
# print(ret.group(2))


# 匹配出<html>hh</html>
# ret = re.match("<[a-zA-Z]*>\w*</[a-zA-Z]*>", "<html>hh</html>")
# print(ret.group())

# 匹配出<html><h1>www.itcast.cn</h1></html>
# ret = re.match(r"<(\w*)><(\w*)>.*</\2></\1>", "<html><h1>www.itcast.cn</h1></html>")
# print(ret.group())

# (?P<name>) (?P=name)
# ret = re.match(r"<(?P<name1>\w*)><(?P<name2>\w*)>.*</(?P=name2)></(?P=name1)>", "<html><h1>www.itcast.cn</h1></html>")
# print(ret.group())



# re模块的高级用法-------search
# 匹配出文章阅读的次数
# ret = re.search(r"\d+", "阅读次数为 9999")
# print(ret.group())

# findall
# 统计出python、c、c++相应文章阅读的次数
# ret = re.findall(r"\d+", "python = 9999, c = 7890, c++ = 12345")
# print (ret)


# sub 将匹配到的数据进行替换
# 需求:将匹配到的阅读次数加1

# 方法1:
# ret = re.sub(r"\d+", '998', "python = 997")
# print (ret)

# 方法2:
# def add(temp):
#     strNum = temp.group()
#     num = int(strNum) + 1
#     return str(num)

# ret = re.sub(r"\d+", add, "python = 997")
# print (ret)
#
# ret = re.sub(r"\d+", add, "python = 99")
# print (ret)


# split 根据匹配进行切割字符串,并返回一个列表
# 需求:切割字符串“info:xiaoZhang 33 shandong”
# ret = re.split(r":| ","info:xiaoZhang 33 shandong")
# print (ret)
#

# 在"*","?","+","{m,n}"后面加上?,使贪婪变成非贪婪。
# s="This is a number 234-235-22-423"
# r=re.match(".+(\d+-\d+-\d+-\d+)",s)
# print(r.group(1))
#
# r=re.match(".+?(\d+-\d+-\d+-\d+)",s)
# print(r.group(1))


# A "?" placed after *, +, or ? makes the quantifier lazy (non-greedy):
# the engine then consumes as few characters as it can while still letting
# the overall pattern match.
re.match("aa(\\d+)", "aa2343ddd").group(1)      # greedy -> '2343'
re.match("aa(\\d+?)", "aa2343ddd").group(1)     # lazy -> '2' (one digit suffices)
re.match("aa(\\d+)ddd", "aa2343ddd").group(1)   # -> '2343'
re.match("aa(\\d+?)ddd", "aa2343ddd").group(1)  # lazy, but the 'ddd' anchor still forces '2343'

然后结合requests、re两者的内容爬取https://movie.douban.com/top250里的内容
要求抓取名次、影片名称、年份、导演等字段。
参考资料: https://desmonday.github.io/2019/03/02/python爬虫学习-day2正则表达式/

import requests
import re
import csv

# https://blog.****.net/bmjhappy/article/details/80512917 中文字符串匹配
def movie_info(url):
    """Scrape one page of the Douban Top250 listing and append rows to the CSV.

    Extracts rank, Chinese title, country, director and score from the page
    at *url* and writes one row per movie via the module-level ``writer``
    (created in the ``__main__`` section before this function is called).

    Raises requests.RequestException on network failure or a non-2xx status.
    """
    headers = {
        # Douban rejects requests that do not carry a browser-like User-Agent.
        'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }

    # timeout keeps the scraper from hanging forever on a stalled connection;
    # raise_for_status fails loudly instead of silently parsing an error page.
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()

    ranks = re.findall(' <em class="">(.*?)</em>', res.text, re.S)
    # [\u4e00-\u9fa5]+ matches a run of CJK characters, i.e. the Chinese title
    # span only (the pinyin/English alias spans are skipped).
    names = re.findall('<span class="title">([\u4e00-\u9fa5]+)</span>', res.text, re.S)
    countries = re.findall('&nbsp;/&nbsp;([\u4e00-\u9fa5]+)&nbsp;/&nbsp;', res.text, re.S)
    # Strip the "导演: " label (note the full-width colon) so only the name
    # remains in front of the &nbsp;&nbsp; delimiter captured below.
    text = re.sub('导演: ', "", res.text)
    directors = re.findall('<p class="">(.*?)&nbsp;&nbsp;', text, re.S)
    scores = re.findall('<span class="rating_num" property="v:average">(.*?)</span>', res.text, re.S)

    # zip truncates to the shortest list, so a movie with a missed field is
    # dropped rather than misaligning every following row's columns.
    for rank, name, country, director, score in zip(ranks, names, countries, directors, scores):
        writer.writerow([rank, name, country, director, score])


if __name__ == '__main__':

    # Use a context manager so the CSV file is flushed and closed even if a
    # request fails partway through the run (the original leaked the handle).
    # newline='' prevents the csv module from writing blank rows on Windows.
    with open('C:/movie.csv', 'w+', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['rank', 'name', 'country', 'director', 'score'])

        # Douban Top250 paginates 25 movies per page: start=0, 25, ..., 225.
        for i in range(0, 250, 25):
            url = 'https://movie.douban.com/top250?start={}&filter='.format(i)
            movie_info(url)

爬虫任务一