
Fang.com (房天下) second-hand housing crawler, data cleaning, and visualization (Python)


Crawler code

### The scraped data is stored in MongoDB; you need to download and install MongoDB yourself.

import requests, json, threading
from bs4 import BeautifulSoup
import numpy as np
import re
from tqdm import trange
from pymongo import MongoClient as Client

city = ['北京', '上海', '广东', '深圳', '沈阳', '大连']
e_city = ['beijing', 'shanghai', 'guangdong', 'shenzheng', 'shenyang', 'dalian']
# NOTE: the host names in these URL templates were stripped when the article was
# reposted; they point to the city-specific Fang.com second-hand listing pages,
# with the page number substituted into {}.
eshouse = ['/house/i3{}/', 'https://sh./house/i3{}/',
           'https://gz./house/i3{}/', 'https://sz./house/i3{}/',
           'https://sy./house/i3{}/', 'https://dl./house/i3{}/']
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/64.0.3282.186 Safari/537.36'}
is_craw = {}
proxies = []


def write_to_mongo(ips, city):
    """Write one page of listings into MongoDB."""
    client = Client(host='localhost', port=27017)
    db = client['fs_db']
    coll = db[city + '_good']
    for ip in ips:
        coll.insert_one({'name': ip[0],
                         'price': ip[1],
                         'addresses': ip[2],
                         'areas': ip[3],
                         'eq': ip[4]})
    client.close()


def read_from_mongo(city):
    client = Client(host='localhost', port=27017)
    db = client['fs_db']
    coll = db[city + '_good']
    # materialise the cursor before closing the connection
    li = list(coll.find())
    client.close()
    return li


class Consumer(threading.Thread):
    """One thread scrapes one listing page and stores the rows it finds."""

    def __init__(self, args):
        threading.Thread.__init__(self, args=args)

    def run(self):
        global is_craw
        url_demo, i, city_id, lock = self._args
        print("{}, 第{}页".format(city[city_id], i))
        url = url_demo.format(i)
        soup = get_real(url)
        names = []                 # listing titles
        for name in soup.select('.tit_shop'):
            names.append(name.text.strip())
        addresses = []             # estate name + street address
        for item in soup.find_all('p', attrs={'class': 'add_shop'}):
            address = item.a.text + " " + item.span.text
            addresses.append(address.replace('\t', '').replace('\n', ''))
        es = []                    # layout | area | floor | orientation | year built | agent
        for item in soup.find_all('p', attrs={'class': 'tel_shop'}):
            es.append(item.text.replace('\t', '').replace('\n', ''))
        moneys = []                # total price
        for money in soup.find_all("span", attrs={"class": 'red'}):
            moneys.append(money.text.strip())
        areas = []                 # unit price
        for area in soup.find_all('dd', attrs={'class': 'price_right'}):
            areas.append(area.find_all('span')[-1].text)
        houses = []
        for idx in range(len(names)):
            try:
                item = [names[idx], moneys[idx], addresses[idx], areas[idx], es[idx]]
                print(item)
                houses.append(item)
            except Exception as e:
                print(e)
        lock.acquire()
        write_to_mongo(houses, e_city[city_id])
        lock.release()
        print("线程结束{}".format(i))


# Proxy-support helpers below (dict2proxy, read_proxies) are defined but not used by craw().
def dict2proxy(dic):
    s = dic['type'] + '://' + dic['ip'] + ':' + str(dic['port'])
    return {'http': s, 'https': s}


def get_real(url):
    """Fetch a listing page, following Fang.com's anti-crawler redirect page."""
    resp = requests.get(url, headers=header)
    soup = BeautifulSoup(resp.content, 'html.parser', from_encoding='gb18030')
    if soup.find('title').text.strip() == '跳转...':
        # the real URL is assembled from the t4 and t3 variables in the page's script
        pattern1 = re.compile(r"var t4='(.*?)';")
        script = soup.find("script", text=pattern1)
        t4 = pattern1.search(str(script)).group(1)
        pattern1 = re.compile(r"var t3='(.*?)';")
        script = soup.find("script", text=pattern1)
        t3 = re.findall(pattern1, str(script))[-2]
        url = t4 + '?' + t3
        HTML = requests.get(url, headers=header)
        soup = BeautifulSoup(HTML.content, 'html.parser', from_encoding='gb18030')
    elif soup.find('title').text.strip() == '访问验证-房天下':
        pass
    return soup


def read_proxies():
    client = Client(host='localhost', port=27017)
    db = client['proxies_db']
    coll = db['proxies']
    # check first, then write, to avoid duplicates
    dic = list(coll.find())
    client.close()
    return dic


def craw():
    lock = threading.Lock()
    for idx in trange(len(e_city)):
        url = eshouse[idx]
        soup = get_real(url.format(2))
        try:
            page_number = int(soup.find('div', attrs={'class': 'page_al'})
                              .find_all('span')[-1].text[1:-1])
            pages = list(range(1, page_number + 1))
        except:
            pages = list(range(1, 101))
        url_demo = url
        ts = []
        # pages = [1, 2, 3]
        while len(pages) != 0:
            # launch up to 10 page-scraping threads per batch
            for i in range(10):
                t = Consumer((url_demo, pages.pop(), idx, lock))
                t.start()
                ts.append(t)
                if len(pages) == 0:
                    break
            # wait for the whole batch before starting the next one
            for t in ts:
                t.join()
            ts.clear()


if __name__ == '__main__':
    craw()
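
The crawler stores its results in MongoDB collections named <city>_good, while the cleaning script below reads a CSV file such as shenyang_good.csv. The article does not show the export step in between, so here is a minimal sketch of what it could look like, assuming the collection layout produced by write_to_mongo() above; the function name export_city_to_csv is my own, not from the original.

# Sketch (not from the original article): export one city's MongoDB collection
# into the CSV file that the cleaning script expects.
import csv
from pymongo import MongoClient


def export_city_to_csv(city='shenyang'):
    client = MongoClient(host='localhost', port=27017)
    coll = client['fs_db'][city + '_good']
    with open(city + '_good.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # header names match the columns the cleaning script accesses via pandas
        writer.writerow(['name', 'price', 'addresses', 'areas', 'eq'])
        for doc in coll.find():
            writer.writerow([doc.get('name'), doc.get('price'),
                             doc.get('addresses'), doc.get('areas'), doc.get('eq')])
    client.close()


if __name__ == '__main__':
    export_city_to_csv('shenyang')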

Data cleaning code

import csv
import pandas as pd

try:
    df = pd.read_csv(r'shenyang_good.csv', encoding='utf-8')
except:
    df = pd.read_csv(r'shenyang_good.csv', encoding='gbk')
df.drop_duplicates(subset=None, keep='first', inplace=True)

out = open('清洗后城市二手房数据.csv', 'a', newline='')
csv_write = csv.writer(out, dialect='excel')
# csv_write.writerow(['城市', '名称', '地址', '单价', '总价', '规格',
#                     '面积', '层段', '总层数', '朝向', '建房时期', '介绍'])
old_addresses = []
old_areas = []
old_eq = []
old_name = []
old_price = []
for index in df['addresses']:
    old_addresses.append(index)
for index in df['areas']:
    old_areas.append(index)
for index in df['eq']:
    old_eq.append(index)
for index in df['name']:
    old_name.append(index)
for index in df['price']:
    old_price.append(index)

for index in range(len(old_addresses)):
    try:
        city = '沈阳'
        # "addresses" holds the estate name and the street address separated by a space
        name, address = old_addresses[index].split(' ')
        yuan = old_areas[index][:-3:]        # unit price, "元/㎡" suffix stripped
        price = old_price[index][:-1:]       # total price, "万" suffix stripped
        intro = old_name[index]              # listing title
        # "eq" packs layout|area|floor|orientation|year built|agent, separated by "|"
        specs, rate, totals, toward, buildTime, people = old_eq[index].split('|')
        rate = rate[:-1:]                    # area, "㎡" suffix stripped
        layer, total = totals.split('(')     # e.g. 中层(共33层) -> floor band + total floors
        total = total[1:-2:]
        buildTime = buildTime[:4:]           # keep the 4-digit year
        csv_write.writerow([city, name, address, yuan, price, specs, rate,
                            layer, total, toward, buildTime, intro])
    except:
        continue
out.close()
print('清洗完成!')
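
Note that the header writerow above is commented out, yet every visualization script below selects columns by the Chinese names ('城市', '总价', '面积', and so on), and the pie-chart script even filters out stray '层段' header rows. The snippet below is my own addition, not from the original: run it before the cleaning loop so the header is written exactly once and the later pandas reads work by column name.

# Sketch (assumption, not in the original): write the header row only once.
import csv
import os

header_row = ['城市', '名称', '地址', '单价', '总价', '规格',
              '面积', '层段', '总层数', '朝向', '建房时期', '介绍']
csv_path = '清洗后城市二手房数据.csv'

# Only write the header if the file does not exist yet or is still empty,
# so repeated cleaning runs do not append duplicate header rows.
if not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0:
    with open(csv_path, 'a', newline='') as f:
        csv.writer(f, dialect='excel').writerow(header_row)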

Visualization code

##1. Stacked bar chart code

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.commons.utils import JsCode
from pyecharts.globals import ThemeType
import pandas as pd

try:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='utf-8')
except:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='gbk')

city_lst = ['北京', '上海', '广州', '深圳', '沈阳', '大连']   # cities
city = []           # city of each record
buildTime = []      # construction year of each record
new_buildTime = [[[], []] for i in range(6)]
data1 = []
data2 = []

# NOTE: the comparison year was lost in the repost ("if int(...) >= :");
# 2000 is an assumed placeholder -- adjust it to the year the chart should split on.
YEAR_SPLIT = 2000

for index in df['城市']:
    city.append(index)
for index in df['建房时期']:
    buildTime.append(index)

for index in range(len(city)):
    for num in range(len(city_lst)):
        if city[index] == city_lst[num]:
            if int(buildTime[index]) >= YEAR_SPLIT:
                new_buildTime[num][1].append(buildTime[index])
            else:
                new_buildTime[num][0].append(buildTime[index])

for g in range(len(new_buildTime)):
    value1 = len(new_buildTime[g][0])
    value2 = len(new_buildTime[g][1])
    result1 = {'value': value1, 'percent': '%.2f' % (value1 / (value1 + value2))}
    result2 = {'value': value2, 'percent': '%.2f' % (value2 / (value1 + value2))}
    data1.append(result1)
    data2.append(result2)
print(data1)
print(data2)

c = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
    .add_xaxis(city_lst)
    .add_yaxis("以前建房", data1, stack="stack1", category_gap="50%")
    .add_yaxis("以后建房", data2, stack="stack1", category_gap="50%")
    .set_series_opts(
        label_opts=opts.LabelOpts(
            position="right",
            formatter=JsCode(
                "function(x){return Number(x.data.percent * 100).toFixed() + '%';}"
            ),
        )
    )
    .render("不同城市前后建房数量-堆叠柱状图.html")
)

##Rendered chart

##2. 3D bar chart code

from pyecharts.charts import Pie, Bar, Line, Scatter3D, Bar3D
from tqdm import trange
from pymongo import MongoClient as Client
import csv
from pyecharts import options as opts


def draw():
    with open("清洗后城市二手房数据.csv", 'r') as f:
        reader = csv.reader(f)
        data = list(reader)
    Bar3D().add(
        series_name="",
        # columns of the cleaned CSV: item[0] = 城市, item[4] = 总价, item[6] = 面积
        data=[[item[0], item[4], item[6]] for item in data],
        xaxis3d_opts=opts.Axis3DOpts(name='城市', type_='category'),
        yaxis3d_opts=opts.Axis3DOpts(name='每平方米价格'),
        zaxis3d_opts=opts.Axis3DOpts(name='面积')
    ).render("不同城市价格对应房屋规格3D.html")


if __name__ == '__main__':
    draw()

##Rendered chart

##3. Line chart code

import pyecharts.options as opts
from pyecharts.charts import Line
import pandas as pd

try:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='utf-8')
except:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='gbk')

city_lst = ['北京', '上海', '广州', '深圳', '沈阳', '大连']   # cities
city = []    # city of each record
price = []   # total price of each record
new_buildTime = [[[], [], [], [], [], []] for i in range(6)]
data1 = []
data2 = []
data3 = []
data4 = []
data5 = []
data6 = []

for index in df['城市']:
    city.append(index)
for index in df['总价']:
    price.append(index)

for index in range(len(city)):
    for num in range(len(city_lst)):
        if city[index] == city_lst[num]:
            if str(city[index]) == "北京":
                new_buildTime[num][0].append(price[index])
            elif str(city[index]) == "上海":
                new_buildTime[num][1].append(price[index])
            elif str(city[index]) == "广东":   # NOTE: the cleaned data uses "广州", so this branch never matches
                new_buildTime[num][2].append(price[index])
            elif str(city[index]) == "深圳":
                new_buildTime[num][3].append(price[index])
            elif str(city[index]) == "沈阳":
                new_buildTime[num][4].append(price[index])
            elif str(city[index]) == "大连":
                new_buildTime[num][5].append(price[index])

# data1..data6 are built here but not used by the chart below
for g in range(len(new_buildTime)):
    data1.append(new_buildTime[g][0])
    data2.append(new_buildTime[g][1])
    data3.append(new_buildTime[g][2])
    data4.append(new_buildTime[g][3])
    data5.append(new_buildTime[g][4])
    data6.append(new_buildTime[g][5])

x_data = ["0-100万", "100-200万", "200-300万", "300-400万", "400-500万", "500-600万",
          '600-700万', '700-800万', '800-900万', '900-1000万', '1000-2000万']

(
    Line()
    .add_xaxis(xaxis_data=x_data)
    .add_yaxis(series_name="北京", stack="总量", y_axis=new_buildTime[0][0],
               label_opts=opts.LabelOpts(is_show=False))
    .add_yaxis(series_name="上海", stack="总量", y_axis=new_buildTime[1][1],
               label_opts=opts.LabelOpts(is_show=False))
    .add_yaxis(series_name="广东", stack="总量", y_axis=new_buildTime[2][2],
               label_opts=opts.LabelOpts(is_show=False))
    .add_yaxis(series_name="深圳", stack="总量", y_axis=new_buildTime[3][3],
               label_opts=opts.LabelOpts(is_show=False))
    .add_yaxis(series_name="沈阳", stack="总量", y_axis=new_buildTime[4][4],
               label_opts=opts.LabelOpts(is_show=False))
    .add_yaxis(series_name="大连", stack="总量", y_axis=new_buildTime[5][5],
               label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="折线图堆叠"),
        tooltip_opts=opts.TooltipOpts(trigger="axis"),
        yaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
        xaxis_opts=opts.AxisOpts(type_="category", boundary_gap=False),
    )
    .render("不同城市房价对比-折线图.html")
)
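
The x-axis labels above name total-price ranges, but each series plots the raw list of per-city prices, so the points are not actually grouped by those ranges. Below is a minimal binning sketch (my own addition, not from the original article) that counts how many listings of a city fall into each labelled range; the counts could then be passed to y_axis instead of the raw lists. bin_prices is an assumed helper name.

# Sketch: bucket total prices (in 万) into the ranges used as x-axis labels.
def bin_prices(prices):
    # bucket edges in 万, matching x_data above; the last bucket is 1000-2000万
    edges = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000]
    counts = [0] * (len(edges) - 1)
    for p in prices:
        try:
            v = float(p)
        except (TypeError, ValueError):
            continue
        for k in range(len(edges) - 1):
            if edges[k] <= v < edges[k + 1]:
                counts[k] += 1
                break
    return counts

# Example: counts for Beijing, aligned with x_data
# beijing_counts = bin_prices(new_buildTime[0][0])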

##Rendered chart

##4. Multi-dimensional scatter chart code

from pyecharts import options as opts
from pyecharts.charts import Scatter
from pyecharts.commons.utils import JsCode
from pyecharts.faker import Faker
import pandas as pd
import pyecharts as pyec

try:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='utf-8')
except:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='gbk')

city_lst = ['北京', '上海', '广州', '深圳', '沈阳', '大连']   # cities
city = []    # city of each record
guige = []   # layout (e.g. 3室2厅)
area = []    # floor area
new_guige = [[[], [], [], [], [], []] for i in range(6)]
new_area = [[[], [], [], [], [], []] for i in range(6)]

for index in df['城市']:
    city.append(index)
for index in df['规格']:
    guige.append(index)
for index in df['面积']:
    area.append(index)

for index in range(len(city)):
    for num in range(len(city_lst)):
        if city[index] == city_lst[num]:
            if str(city[index]) == "北京":
                new_guige[num][0].append(guige[index])
                new_area[num][0].append(area[index])
            elif str(city[index]) == "上海":
                new_guige[num][1].append(guige[index])
                new_area[num][1].append(area[index])
            elif str(city[index]) == "广州":
                new_guige[num][2].append(guige[index])
                new_area[num][2].append(area[index])
            elif str(city[index]) == "深圳":
                new_guige[num][3].append(guige[index])
                new_area[num][3].append(area[index])
            elif str(city[index]) == "沈阳":
                new_guige[num][4].append(guige[index])
                new_area[num][4].append(area[index])
            elif str(city[index]) == "大连":
                new_guige[num][5].append(guige[index])
                new_area[num][5].append(area[index])

y_data = ["1室1厅", "1室0厅", "1室2厅", "1室3厅", "1室4厅",
          "2室0厅", '2室1厅', '2室2厅', '2室3厅', '2室4厅',
          "3室0厅", '3室1厅', '3室2厅', '3室3厅', '3室4厅',
          "4室0厅", '4室1厅', '4室2厅', '4室3厅', '4室4厅',
          "5室0厅", '5室1厅', '5室2厅', '5室3厅', '5室4厅',
          "6室0厅", '6室1厅', '6室2厅', '6室3厅', '6室4厅']
x_data = ['0-50万', '50-100万', '100-150万', '150-200万', '200-250万', '250-300万',
          '300-350万', '350-400万', '400-450万', '450-500万', '500-550万', '550-600万']

# label callback: params.value[1] is the area, params.value[2] the layout
label_js = JsCode(
    "function(params){return params.value[1] +' : '+ params.value[2];}"
)

chart = Scatter().add_xaxis(x_data)
# The original repeats this add_yaxis call once per city with identical options;
# a loop keeps the behaviour the same.
for k, name in enumerate(city_lst):
    chart.add_yaxis(
        name,
        [list(z) for z in zip(new_area[k][k], new_guige[k][k])],
        symbol_size=20,                                   # marker size
        label_opts=opts.LabelOpts(formatter=label_js),    # label options
    )
c = (
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="Scatter-多维度数据"),
        # tooltip callback: params.name is the x category, params.value[2] the layout
        tooltip_opts=opts.TooltipOpts(
            formatter=JsCode(
                "function (params) {return params.name + ' : ' + params.value[2];}"
            )
        ),
        # visual map: colour the points by dimension 1 (the area value)
        visualmap_opts=opts.VisualMapOpts(
            type_="color",
            max_=150,
            min_=20,
            dimension=1,
        ),
    )
    .render("不同城市价格对应房屋规格-多维散点图.html")
)

##Rendered chart

##5. Funnel chart code

import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Funnel

try:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='utf-8')
except:
    df = pd.read_csv(r'清洗后城市二手房数据.csv', encoding='gbk')
df.drop_duplicates(subset=None, keep='first', inplace=True)

city = []
unit_price = []
for index in df['城市']:
    city.append(index)
for index in df['单价']:
    unit_price.append(index)

last_city = list(set(city))
new_unit_price = [[] for i in range(len(last_city))]
last_unit_price = []

for i in range(len(unit_price)):
    for j in range(len(last_city)):
        if city[i] == last_city[j]:
            new_unit_price[j].append(int(unit_price[i]))

for index in new_unit_price:
    result = sum(index) / len(index)
    last_unit_price.append('%.2f' % result)

c = (
    Funnel()
    .add(
        "平均单价",
        [list(z) for z in zip(last_city, last_unit_price)],
        sort_="ascending",
        label_opts=opts.LabelOpts(position="inside"),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="不同城市平均单位房价(元/m^2)"))
    .render("不同城市的平均单位房价-漏斗图.html")
)

##Rendered chart

##6. Pie chart code

import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Pie

df = pd.read_csv(r'D:\清洗后城市二手房数据.csv', encoding='gbk')
# drop stray header rows that were appended into the data file
df.drop(df[df['层段'].str.contains('层段')].index, inplace=True)
data_heigh = df['层段']
lst_height_value = data_heigh.value_counts().keys().tolist()
lst_counts = data_heigh.value_counts().tolist()


def get_pie():
    x_data = lst_height_value
    y_data = lst_counts
    data_pair = [list(z) for z in zip(x_data, y_data)]
    data_pair.sort(key=lambda x: x[1])
    c = (
        Pie(init_opts=opts.InitOpts(width="1600px", height="800px", bg_color="#2c343c"))
        .add(
            series_name="层段信息",
            data_pair=data_pair,
            rosetype="radius",
            radius="55%",
            center=["50%", "50%"],
            label_opts=opts.LabelOpts(is_show=False, position="center"),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="Customized Pie",
                pos_left="center",
                pos_top="20",
                title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
            ),
            legend_opts=opts.LegendOpts(is_show=False),
        )
        .set_series_opts(
            tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"),
            label_opts=opts.LabelOpts(color="rgba(255, 255, 255, 0.3)"),
        )
    )
    return c


if __name__ == '__main__':
    get_pie().render('所有城市楼盘的层段分析-饼状图.html')

##Rendered chart

Dataset

Link: /s/1qN00fxxo9wi1tF5wiqj4ZQ

Extraction code: uvy3
