import requests
from bs4 import BeautifulSoup
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def GetUrlHtml(url):
    """Download *url* and return its body as decoded text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or the sentinel string "URL异常了" when the
        request fails or the server answers with an HTTP error status.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # The request itself is inside the try so that connection errors and
        # timeouts are reported through the same sentinel as HTTP errors.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Decode the body with the encoding detected from its content; the
        # detected value belongs on `encoding`, not on `status_code`.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return "URL异常了"
def HtmlParser(response):
    """Parse one listing page and append its fields to the module-level lists.

    Nothing is returned. Results are appended to the global lists
    ``loupan_name``, ``resblock_type``, ``sale_status``,
    ``resblock_location`` and ``main_price``, which must exist before this
    function is called.

    Args:
        response: HTML text of a listing page.

    Raises:
        IndexError: If a matched element's text has fewer newline-separated
            parts than the positions read below.
    """
    soup = BeautifulSoup(response, "lxml")

    # Name, type and sale status are parts 1, 2 and 3 of each
    # "resblock-name" element's text when split on newlines.
    for block in soup.find_all("div", class_="resblock-name"):
        parts = block.text.split("\n")
        loupan_name.append(parts[1])
        resblock_type.append(parts[2])
        sale_status.append(parts[3])

    # The location is assembled from parts 1, 3 and 5 of each
    # "resblock-location" element, joined with "/".
    for block in soup.find_all("div", class_="resblock-location"):
        parts = block.text.split("\n")
        resblock_location.append("/".join((parts[1], parts[3], parts[5])))

    # Only part 1 of each "main-price" element is stored as the price.
    for block in soup.find_all("div", class_="main-price"):
        main_price.append(block.text.split("\n")[1])
def plot(house):
    """Draw a line chart of listing price against listing name and show it.

    Args:
        house: Table-like object with "resblock_name" and "main_price"
            columns (the DataFrame built by this script).
    """
    names = np.array(house["resblock_name"])
    # Prices are collected as strings; convert them so the y axis is numeric.
    # Entries that are not numbers become NaN and leave a gap in the line.
    prices = np.array(pd.to_numeric(house["main_price"], errors="coerce"))

    plt.rc('font', family='STXihei', size=11)
    # Draw the line first: the legend labels artists that already exist.
    plt.plot(names, prices)
    # Axis titles.
    plt.xlabel("楼盘名称")
    plt.ylabel("楼盘价格")
    plt.legend(["价格"], loc="upper right")
    plt.show()
if __name__ == '__main__':
    # Script entry point: scrape pages 1-21, save the table as CSV, then
    # drop rows without a price and order the rest from high to low.
    url = "https://jn.fang.lianjia.com/loupan/"

    # Module-level accumulators that HtmlParser appends to.
    loupan_name = []
    resblock_type = []
    sale_status = []
    resblock_location = []
    main_price = []

    for page in range(1, 22):
        # Build each page address from the unchanged base URL; appending to
        # `url` itself would produce ".../pg1/pg2/..." from page 2 onwards.
        page_url = url + "pg" + str(page) + "/"
        response = GetUrlHtml(page_url)
        HtmlParser(response)

    # NOTE(review): the " resblock_type" key has a leading space. It is kept
    # as-is because it is the column header written to the CSV; confirm
    # whether that is intended.
    columns = ["resblock_name", "main_price", "resblock_location", " resblock_type", "sale_status"]
    house = pd.DataFrame({
        "resblock_name": loupan_name,
        "main_price": main_price,
        "resblock_location": resblock_location,
        " resblock_type": resblock_type,
        "sale_status": sale_status,
    })
    # Fix the column order.
    house = house[columns]
    print(house)

    os.makedirs("济南房价数据", exist_ok=True)
    house.to_csv("济南房价数据/房价.csv", encoding='gbk', index=False)
    # plot(house)

    # Drop the rows whose price is still pending.
    house = house[~house['main_price'].str.contains("价格待定")]
    # Sort by price, highest first. The prices are strings, so compare their
    # numeric value rather than their text.
    house = house.sort_values(
        by="main_price",
        ascending=False,
        key=lambda col: pd.to_numeric(col.str.strip(), errors="coerce"),
    )
# Result: the listings sorted by price from high to low, with the "价格待定" (price pending) rows removed.
