使用 R 语言 rvest 包爬取大众点评上海商场信息(R 语言与大数据分析 QQ 群:456726635,欢迎讨论交流)
library(rvest)
library(xml2)
library(magrittr)
# Scrape the Dianping search listing for general shopping malls, one result
# page at a time, and pull the per-shop fields out of each page.
#
# The original text contained this script pasted twice, with the second copy
# nested inside the first (unclosed) loop; it is collapsed here into a single
# well-formed loop.

# Dianping site root.
dianpingweb <- "http://www.dianping.com"
# Search path for the general shopping-mall category. The page number is
# appended directly after the trailing "p".
comshop <- "/search/category/1/20/g119p"
# Result pages to visit.
webii <- seq_len(22L)
# NOTE(review): nothing in the visible script ever appends to `fulldata`;
# the aggregation step appears to be missing from this listing. The vectors
# extracted below may differ in length per page, so confirm how they should
# be aligned before binding them into a data frame.
fulldata <- data.frame()

for (i in webii) {
  # Listing page i, e.g. "http://www.dianping.com/search/category/1/20/g119p1".
  web <- read_html(
    paste0(dianpingweb, comshop, as.character(i)),
    encoding = "UTF-8"
  )

  # Mall name.
  businame <- web %>%
    html_nodes("div.tit") %>%
    html_nodes("h4") %>%
    html_text()

  # Category and address tags. The spans are reshaped two per row, i.e. the
  # code assumes they arrive as (category, address) pairs in document order.
  busitag <- web %>%
    html_nodes("div.tag-addr") %>%
    html_nodes("a") %>%
    html_nodes("span") %>%
    html_text()
  temptag <- matrix(busitag, ncol = 2, byrow = TRUE)
  busicatog <- temptag[, 1]
  busiaddr <- temptag[, 2]

  # Rating: the `title` attribute of the spans inside the comment block.
  busideg <- web %>%
    html_nodes("div.comment") %>%
    html_nodes("span") %>%
    html_attr("title")

  # Review count.
  reviewnum <- web %>%
    html_nodes("div.comment") %>%
    html_nodes("a.review-num b") %>%
    html_text()

  # Average spend. Only entries that actually show a price carry a <b> node,
  # so `nullprice` can be shorter than `pricea`.
  nullprice <- web %>%
    html_nodes("a.mean-price b") %>%
    html_text()
  nullprice <- gsub(pattern = "¥", replacement = "", nullprice)
  pricea <- web %>%
    html_nodes("a.mean-price") %>%
    html_text()

  # Flag the entries whose text contains the currency sign, then drop the
  # extracted prices into those positions in order. Entries without a price
  # keep the placeholder "0". Vectorised, so a page with no price nodes
  # yields an empty vector instead of a spurious element from `1:0`.
  has_price <- grepl(pattern = "\\¥", pricea)
  meanprice <- rep("0", length(pricea))
  meanprice[has_price] <- nullprice[seq_len(sum(has_price))]

  # Link to each shop's detail page.
  childhtml <- web %>%
    html_nodes("div.pic") %>%
    html_nodes("a") %>%
    html_attr("href")

  # Progress indicator.
  print(i)
}
图片中是爬取的数据样例。