使用正则表达式,取得点击次数,函数抽离
学会使用正则表达式
1. 用正则表达式判定邮箱是否输入正确。
import re def validateEmail(email): if len(email) > 7: if re.match("^.+\\@(\\[?)[a-zA-Z0-9\\-\\.]+\\.([a-zA-Z]{2,3}|[0-9]{1,3})(\\]?)$", email) != None: print('邮箱输入正确'); else: print("邮箱输入不正确") validateEmail('[email protected]')
2. 用正则表达式识别出全部电话号码。
# Task 2: pull every phone number out of the text with a regex.
text = "0753 78750541"
# Optional "(", leading 0 plus 2-3 area-code digits, an optional
# ")"/" "/"-" separator, then a 7-8 digit subscriber number.
m = re.findall(r"\(?0\d{2,3}[) -]?\d{7,8}", text)
print(m[0] if m else "号码验证不成功")
3. 用正则表达式进行英文分词。re.split('[\s .,?]+', news)
# Task 3: tokenize the English news text by splitting on runs of
# whitespace and sentence punctuation.
news = '''Chinese President Xi Jinping delivers a keynote speech at the opening ceremony of the Boao Forum for Asia Annual Conference 2018 in Boao, south China's Hainan Province, April 10, 2018. (Xinhua/Li Xueren) Chinese President Xi Jinping promised to further open up China to the world at the opening ceremony of the Boao Forum for Asia Annual Conference 2018 on Tuesday'''
tokens = re.split('[\s .,?]+', news)
print(tokens)
4. 使用正则表达式取得新闻编号
5. 生成点击次数的Request URL
6. 获取点击次数
7. 将456步骤定义成一个函数 def getClickCount(newsUrl):
8. 将获取新闻详情的代码定义成一个函数 def getNewDetail(newsUrl):
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

url = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
res = requests.get(url)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')


def getClickCount(newsUrl):
    """Fetch and print the click count for one news article (Tasks 4-7).

    Extracts the news id from *newsUrl*, builds the click-count API URL,
    fetches it, and prints the id, the URL and the count.
    """
    # The news id is the last path component of the "_<id>.html" suffix.
    newId = re.search(r'\_(.*).html', newsUrl).group(1).split('/')[-1]
    # BUG FIX: the original hard-coded id=9172, so every article reported
    # the same article's click count; interpolate the extracted id instead.
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newId)
    # The API responds with JS like "...html('123');" — strip the wrapper.
    rest = requests.get(clickUrl).text.split('.html')[-1].lstrip("('").rstrip("');")
    print("新闻编号:", newId)
    print("新闻点击次数URL:", clickUrl)
    print("新闻点击次数:", rest)


def getNewDetail(Url):
    """Print title, link, body, publish time, author, source and
    photographer for every item on the front-page listing (Task 8).

    NOTE(review): the *Url* parameter is currently unused — the function
    walks the module-level front-page ``soup``; confirm whether it should
    fetch *Url* instead.
    """
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            t1 = news.select('.news-list-title')[0].text
            a1 = news.select('a')[0].attrs['href']
            # Fetch the detail page once (the original fetched it twice).
            resd = requests.get(a1)
            resd.encoding = 'utf-8'
            soupd = BeautifulSoup(resd.text, 'html.parser')
            c1 = soupd.select('#content')[0].text
            info = soupd.select('.show-info')[0].text
            print("新闻标题:", t1)
            print("新闻链接:", a1)
            print("新闻详情:", c1)
            # The first 24 chars of the info line hold
            # "发布时间:YYYY-mm-dd HH:MM:SS".
            time = info[0:24].lstrip('发布时间:')
            dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
            print("新闻发布时间:", dt)
            author = info[info.find('作者'):].split()[0].lstrip('作者:')
            fromwhere = info[info.find('来源'):].split()[0].lstrip('来源:')
            photo = info[info.find('摄影'):].split()[0].lstrip('摄影:')
            print("新闻作者:", author)
            print("新闻来源:", fromwhere)
            print("新闻摄影:", photo)
            getClickCount(a1)


def getPage(url):
    """Return the total number of listing pages (10 items per page).

    NOTE(review): reads the module-level ``soup``; the *url* parameter is
    kept for interface compatibility but is never fetched.
    """
    return int(soup.select('.a1')[0].text.rstrip('条')) // 10 + 1


def getlist(url):
    """Print source, title, description and link for each item on the
    front-page listing (module-level ``soup``; *url* unused)."""
    for i in soup.select('li'):
        if len(i.select('.news-list-title')) > 0:
            place = i.select('.news-list-info')[0].contents[1].text  # source
            title = i.select('.news-list-title')[0].text  # title
            description = i.select('.news-list-description')[0].text  # description
            detailurl = i.select('a')[0].attrs['href']  # link
            print("来源:" + place)
            print("新闻标题:" + title)
            print("新闻描述:" + description)
            print("新闻链接:" + detailurl)


def getall(url):
    """Walk listing pages 2..getPage(url)-1, printing list and detail info."""
    for num in range(2, getPage(url)):
        listpageurl = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(num)
        getlist(listpageurl)
        getNewDetail(listpageurl)


getall(url)