使用正则表达式,取得点击次数,函数抽离
学会使用正则表达式
1. 用正则表达式判定邮箱是否输入正确。
import re def validateEmail(email): if len(email) > 7: if re.match("^.+\\@(\\[?)[a-zA-Z0-9\\-\\.]+\\.([a-zA-Z]{2,3}|[0-9]{1,3})(\\]?)$", email) != None: print('邮箱输入正确'); else: print("邮箱输入不正确") validateEmail('[email protected]')
2. 用正则表达式识别出全部电话号码。
# Task 2: pull every phone number out of the text with a regex.
text = "0753 78750541"
# Optional "(", leading 0 plus 2-3 area-code digits, an optional
# ")"/" "/"-" separator, then a 7-8 digit subscriber number.
m = re.findall(r"\(?0\d{2,3}[) -]?\d{7,8}", text)
print(m[0] if m else "号码验证不成功")
3. 用正则表达式进行英文分词。re.split('[\s .,?]+', news)
# Task 3: tokenize the English news text by splitting on runs of
# whitespace and sentence punctuation.
news = '''Chinese President Xi Jinping delivers a keynote speech at the opening ceremony of the Boao Forum for Asia Annual Conference 2018 in Boao, south China's Hainan Province, April 10, 2018. (Xinhua/Li Xueren) Chinese President Xi Jinping promised to further open up China to the world at the opening ceremony of the Boao Forum for Asia Annual Conference 2018 on Tuesday'''
tokens = re.split('[\s .,?]+', news)
print(tokens)
4. 使用正则表达式取得新闻编号
5. 生成点击次数的Request URL
6. 获取点击次数
7. 将456步骤定义成一个函数 def getClickCount(newsUrl):
8. 将获取新闻详情的代码定义成一个函数 def getNewDetail(newsUrl):
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

url = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
res = requests.get(url)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')


def getClickCount(newsUrl):
    """Fetch and print the click count for one news article (Tasks 4-7).

    Extracts the news id from *newsUrl*, builds the click-count API URL,
    fetches it, and prints the id, the URL and the count.
    """
    # The news id is the last path component of the "_<id>.html" suffix.
    newId = re.search(r'\_(.*).html', newsUrl).group(1).split('/')[-1]
    # BUG FIX: the original hard-coded id=9172, so every article reported
    # the same article's click count; interpolate the extracted id instead.
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newId)
    # The API responds with JS like "...html('123');" — strip the wrapper.
    rest = requests.get(clickUrl).text.split('.html')[-1].lstrip("('").rstrip("');")
    print("新闻编号:", newId)
    print("新闻点击次数URL:", clickUrl)
    print("新闻点击次数:", rest)


def getNewDetail(Url):
    """Print title, link, body, publish time, author, source and
    photographer for every item on the front-page listing (Task 8).

    NOTE(review): the *Url* parameter is currently unused — the function
    walks the module-level front-page ``soup``; confirm whether it should
    fetch *Url* instead.
    """
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            t1 = news.select('.news-list-title')[0].text
            a1 = news.select('a')[0].attrs['href']
            # Fetch the detail page once (the original fetched it twice).
            resd = requests.get(a1)
            resd.encoding = 'utf-8'
            soupd = BeautifulSoup(resd.text, 'html.parser')
            c1 = soupd.select('#content')[0].text
            info = soupd.select('.show-info')[0].text
            print("新闻标题:", t1)
            print("新闻链接:", a1)
            print("新闻详情:", c1)
            # The first 24 chars of the info line hold
            # "发布时间:YYYY-mm-dd HH:MM:SS".
            time = info[0:24].lstrip('发布时间:')
            dt = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
            print("新闻发布时间:", dt)
            author = info[info.find('作者'):].split()[0].lstrip('作者:')
            fromwhere = info[info.find('来源'):].split()[0].lstrip('来源:')
            photo = info[info.find('摄影'):].split()[0].lstrip('摄影:')
            print("新闻作者:", author)
            print("新闻来源:", fromwhere)
            print("新闻摄影:", photo)
            getClickCount(a1)


def getPage(url):
    """Return the total number of listing pages (10 items per page).

    NOTE(review): reads the module-level ``soup``; the *url* parameter is
    kept for interface compatibility but is never fetched.
    """
    return int(soup.select('.a1')[0].text.rstrip('条')) // 10 + 1


def getlist(url):
    """Print source, title, description and link for each item on the
    front-page listing (module-level ``soup``; *url* unused)."""
    for i in soup.select('li'):
        if len(i.select('.news-list-title')) > 0:
            place = i.select('.news-list-info')[0].contents[1].text  # source
            title = i.select('.news-list-title')[0].text  # title
            description = i.select('.news-list-description')[0].text  # description
            detailurl = i.select('a')[0].attrs['href']  # link
            print("来源:" + place)
            print("新闻标题:" + title)
            print("新闻描述:" + description)
            print("新闻链接:" + detailurl)


def getall(url):
    """Walk listing pages 2..getPage(url)-1, printing list and detail info."""
    for num in range(2, getPage(url)):
        listpageurl = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(num)
        getlist(listpageurl)
        getNewDetail(listpageurl)


getall(url)