# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import logging
import re
import time

import scrapy
from scrapy.selector import Selector

# Identifier stamped on every item produced by this module.
HOUSE_ID = '109'

# Format of the ``date`` field recorded at scrape time.
_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

# Multipliers for the unit suffix that follows the total price
# ('万' = ten thousand, '千' = thousand).
_PRICE_MULTIPLIERS = {'万': 10000, '千': 1000}

# Shared XPath shapes; the placeholders are filled in by the helpers below.
_GENERAL_XPATH = (
    '//div[@id="%s"]/div[@class="general-item-wrap"]'
    '/ul[@class="general-item-%s"]/li[%d]/span[2]/text()'
)
_FTX_INFO_XPATH = (
    '//div[@class="box"][1]/div[@class="inforwrap clearfix"]'
    '/dl/dd[strong="%s"]%s'
)
_FTX_PRICE_XPATH = (
    '//div[@class="box detaiLtop mt20 clearfix"]/dl[%d]/dd/span/text()'
)
_ZUFANG_INFO_XPATH = (
    '//ul[@class="house-info-zufang cf"]/li[span="%s"]'
    '/span[@class="info"]/text()'
)


def _strip(value):
    """Return ``value.strip()``, or ``None`` when ``value`` is ``None``."""
    return value.strip() if value is not None else None


def _search(pattern, text, group=0):
    """Return ``group`` of the first match of ``pattern`` in ``text``.

    Returns ``None`` when ``text`` is empty/``None`` or nothing matches, so
    callers never call ``.group()`` on a failed match.
    """
    if not text:
        return None
    found = re.search(pattern, text)
    return found.group(group) if found else None


def _put(item, key, value):
    """Store ``value`` under ``key`` only when it is not ``None``."""
    if value is not None:
        item[key] = value


def _scale_price(amount, unit):
    """Convert a price string to a plain integer string using its unit.

    ``amount`` is the numeric text (may contain a decimal point) and ``unit``
    the suffix text. When the unit is not one of ``_PRICE_MULTIPLIERS`` the
    amount is returned unchanged; ``None`` is passed through.
    """
    if amount is None:
        return None
    multiplier = _PRICE_MULTIPLIERS.get(unit.strip() if unit else unit)
    if multiplier is None:
        return amount
    # float + round: the price regex allows decimals such as "123.5",
    # which int() alone would reject.
    return str(int(round(float(amount) * multiplier)))


def _set_coordinate(item):
    """Fill ``item['coordinate']`` with ``[longitude, latitude]`` as floats.

    A value that cannot be converted is logged and the field is left unset.
    """
    try:
        item['coordinate'] = [float(item['longitude']), float(item['latitude'])]
    except (TypeError, ValueError) as err:
        logging.error('type conversion error ! reason: %s', err)


def _stamp(item, response):
    """Record the page URL, the scrape timestamp and the house id."""
    item['page_url'] = response.url
    item['date'] = time.strftime(_TIMESTAMP_FORMAT)
    item['house_id'] = HOUSE_ID


class ResoldApartmentItem(scrapy.Item):
    """Item describing one resold-apartment detail page."""

    _id = scrapy.Field()
    title = scrapy.Field()
    unit_price = scrapy.Field()
    total_price = scrapy.Field()
    orientation = scrapy.Field()
    area = scrapy.Field()
    built_year = scrapy.Field()
    property = scrapy.Field()
    decoration = scrapy.Field()
    model = scrapy.Field()
    floor = scrapy.Field()
    image = scrapy.Field()
    house_type = scrapy.Field()
    trading_ownership = scrapy.Field()
    tag = scrapy.Field()
    location = scrapy.Field()
    longitude = scrapy.Field()
    latitude = scrapy.Field()
    page_url = scrapy.Field()
    house_id = scrapy.Field()

    @classmethod
    def handle_response(cls, response):
        """Build an item from a detail-page response.

        Fields whose node or pattern is missing are stored as ``None`` (or
        left unset for ``built_year``, ``location`` and ``tag``) instead of
        raising.

        :param response: Scrapy response exposing ``xpath`` and ``url``.
        :return: the populated item.
        """
        def general(section, side, index):
            return response.xpath(
                _GENERAL_XPATH % (section, side, index))

        item = cls()
        item['title'] = _strip(response.xpath(
            '//ul[@class="house-basic-item3"]/li[1]'
            '/span[@class="c_000 mr_10"]/a[1]/text()').extract_first())
        item['unit_price'] = _search(
            r'[1-9]\d*',
            response.xpath('//span[@class="unit"]/text()').extract_first())

        # Total price: numeric text plus a unit suffix held in a <b> child.
        total_price = _search(
            r'[0-9]+\.?[0-9]*',
            response.xpath('//span[@class="price"]/text()').extract_first())
        price_unit = response.xpath(
            r'//span[@class="price"]/b/text()').extract_first()
        item['total_price'] = _scale_price(total_price, price_unit)

        item['orientation'] = response.xpath(
            '//p[@class="toward"]/span[@class="main"]/text()').extract_first()

        # Built year: first run of digits in the secondary "toward" span.
        _put(item, 'built_year', _search(r'\d+', response.xpath(
            r'//p[@class="toward"]/span[@class="sub"]/text()').extract_first()))

        item['area'] = _search(
            r'[\d]+', general('generalSituation', 'left', 3).extract_first())
        item['property'] = general(
            'generalSituation', 'right', 3).re_first(r'[\d]+')
        item['decoration'] = general(
            'generalSituation', 'right', 2).extract_first()
        item['model'] = general('generalSituation', 'left', 2).extract_first()
        item['floor'] = general('generalSituation', 'right', 1).extract_first()
        item['house_type'] = general(
            'generalExpense', 'left', 2).extract_first()
        item['trading_ownership'] = general(
            'generalExpense', 'left', 3).extract_first()

        # Image
        item['image'] = response.xpath(
            r'//div[@class="basic-pic-list pr"]/ul[@id="leftImg"]'
            r'/li[1]/img/@data-value').extract_first()

        # Location: link texts joined with '-'.
        location_list = response.xpath(
            r'//ul[@class="house-basic-item3"]/li[2]/span[2]/a/text()').extract()
        if location_list:
            item['location'] = '-'.join(location_list)

        # Tag: only the first one is kept.
        tag_list = response.xpath(
            r'//p[@class="house-update-info"]/span[@class="ts"]/text()').extract()
        if tag_list:
            item['tag'] = tag_list[0]

        # Latitude / longitude, read from the first inline script element.
        script_string = response.xpath(
            r'//script[@type="text/javascript"]').extract_first()
        item['latitude'] = _search(
            r'"lat":([1-9]\d\.\d*),"', script_string, 1)
        item['longitude'] = _search(
            r'"lon":([1-9]\d\d\.\d*),"', script_string, 1)

        item['page_url'] = response.url
        item['house_id'] = HOUSE_ID
        return item


class CommunityItem(scrapy.Item):
    """Item describing one community entry of a listing page."""

    _id = scrapy.Field()
    title = scrapy.Field()
    unit_price = scrapy.Field()
    floating_rate = scrapy.Field()
    built_year = scrapy.Field()
    location = scrapy.Field()
    page_url = scrapy.Field()
    type = scrapy.Field()
    house_id = scrapy.Field()

    @classmethod
    def handle_response(cls, response, type):
        """Build an item from a markup fragment.

        :param response: text handed to ``Selector(text=...)``.
        :param type: value stored verbatim in the ``type`` field.
        :return: the populated item; missing nodes yield ``None``.
        """
        item = cls()
        selector = Selector(text=response)
        item['title'] = selector.xpath(r'//h3/a/@title').extract_first()
        item['unit_price'] = selector.xpath(
            r'//div[@class="li-side"]/p/strong/text()').extract_first()

        floating_rate = selector.xpath(
            r'//div[@class="li-side"]/p[@class="price-txt"]/text()'
        ).extract_first()
        if not floating_rate:
            # Fall back to the "price-down" variant of the same node.
            floating_rate = selector.xpath(
                r'//div[@class="li-side"]/p[@class="price-txt price-down"]/text()'
            ).extract_first()
        item['floating_rate'] = floating_rate

        item['location'] = _strip(selector.xpath(
            r'//div[@class="li-info"]/address/text()').extract_first())
        item['page_url'] = _strip(selector.xpath(
            r'//div[@_soj="xqlb"]/@link').extract_first())
        item['built_year'] = _strip(selector.xpath(
            r'//p[@class="date"]/text()').extract_first())
        item['type'] = type
        item['house_id'] = HOUSE_ID
        return item


class FTXCommunityItem(scrapy.Item):
    """Item describing one community detail page (FTX page layout)."""

    _id = scrapy.Field()
    title = scrapy.Field()
    unit_price = scrapy.Field()
    floating_rate = scrapy.Field()
    year_floating_tare = scrapy.Field()
    built_year = scrapy.Field()
    property = scrapy.Field()
    property_type = scrapy.Field()
    building_type = scrapy.Field()
    greening_rate = scrapy.Field()
    plot_ratio = scrapy.Field()
    total_area = scrapy.Field()
    building_area = scrapy.Field()
    construction = scrapy.Field()
    location = scrapy.Field()
    region = scrapy.Field()
    page_url = scrapy.Field()
    img_url = scrapy.Field()
    predict_type = scrapy.Field()
    house_id = scrapy.Field()

    @classmethod
    def handle_response(cls, response):
        """Build an item from a community detail-page response.

        ``predict_type`` is derived from the ``property`` digits: no digits
        -> '其他', '70' -> '住宅', anything else -> '公寓'.

        :param response: Scrapy response exposing ``xpath`` and ``url``.
        :return: the populated item; missing nodes yield ``None``.
        """
        def info(label, tail='/text()'):
            # Value of the <dd> whose <strong> child equals ``label``.
            return response.xpath(
                _FTX_INFO_XPATH % (label, tail)).extract_first()

        item = cls()
        title_string: str = response.xpath(
            r'//div[@class="logoBox_sq"]/div[@class="ceninfo_sq"]'
            r'/h1/a[@class="tt"]/text()').extract_first()
        if title_string:
            item['title'] = title_string.replace('小区网', '')

        item['unit_price'] = response.xpath(
            _FTX_PRICE_XPATH % 1).extract_first()
        item['floating_rate'] = response.xpath(
            _FTX_PRICE_XPATH % 2).extract_first()
        item['year_floating_tare'] = response.xpath(
            _FTX_PRICE_XPATH % 3).extract_first()

        item['location'] = info('小区地址:')
        item['region'] = info('所属区域:')
        # First one- or two-digit number in the ownership description.
        item['property'] = _search(r'[\d]{1,2}', info('产权描述:'))
        item['property_type'] = info('物业类别:')

        if not item['property']:
            item['predict_type'] = '其他'
        elif item['property'] == '70':
            item['predict_type'] = '住宅'
        else:
            item['predict_type'] = '公寓'

        item['construction'] = info('建筑结构:', '/span/text()')
        item['built_year'] = info('建筑年代:')
        item['building_type'] = info('建筑类型:')
        item['greening_rate'] = info('绿 化 率:')
        item['plot_ratio'] = info('容 积 率:')
        item['total_area'] = info('占地面积:')
        item['building_area'] = info('建筑面积:')
        item['img_url'] = response.xpath(
            r'//div[@class="logoBox_sq"]/div[@class="logopic_sq"]/a/img/@src'
        ).extract_first()
        item['page_url'] = response.url
        item['house_id'] = HOUSE_ID
        return item


class RentalHouseItem(scrapy.Item):
    """Item describing one rental-house detail page."""

    _id = scrapy.Field()
    title = scrapy.Field()
    location = scrapy.Field()
    price = scrapy.Field()
    house_type = scrapy.Field()
    area = scrapy.Field()
    orientation = scrapy.Field()
    floor = scrapy.Field()
    decoration = scrapy.Field()
    property_type = scrapy.Field()
    house_code = scrapy.Field()
    publish_date = scrapy.Field()
    longitude = scrapy.Field()
    latitude = scrapy.Field()
    img_url = scrapy.Field()
    page_url = scrapy.Field()
    date = scrapy.Field()
    coordinate = scrapy.Field()
    house_id = scrapy.Field()

    @classmethod
    def handle_response(cls, response):
        """Build an item from a rental detail-page response.

        :param response: Scrapy response exposing ``xpath`` and ``url``.
        :return: the populated item; optional fields are left unset when
            their node or pattern is missing.
        """
        item = cls()

        # The community row holds up to three links: name, then two
        # location parts that are joined as "<second>-<third>".
        name_list: list = response.xpath(
            r'//ul[@class="house-info-zufang cf"]/li[span="小区:"]/a/text()'
        ).extract()
        location_string = ''
        if name_list:
            item['title'] = name_list[0]
            if len(name_list) > 1:
                location_string += name_list[1] + "-"
            if len(name_list) > 2:
                location_string += name_list[2]
        # NOTE(review): SOURCE indentation was lost; location is assumed to
        # be stored unconditionally (empty string when no links) - confirm.
        item['location'] = location_string

        price_list = response.xpath(
            r'//li[@class="full-line cf"]/span[@class="price"]//text()'
        ).extract()
        if price_list:
            item['price'] = "".join(price_list)

        for field, label in (
            ('house_type', '户型:'),
            ('area', '面积:'),
            ('orientation', '朝向:'),
            ('floor', '楼层:'),
            ('decoration', '装修:'),
            ('property_type', '类型:'),
        ):
            item[field] = response.xpath(
                _ZUFANG_INFO_XPATH % label).extract_first()

        house_info_string = response.xpath(
            r'//div[@class="mod-title bottomed"][h3="房屋信息"]/div/text()'
        ).extract_first()
        _put(item, 'house_code', _search(r'[\d]{6,}', house_info_string))
        _put(item, 'publish_date', _search(
            r'[\d]{0,4}年[\d]{0,2}月[\d]{0,2}日', house_info_string))

        # Coordinates are embedded in the page text as "lng:<n>," / "lat:<n>,".
        page = Selector(response)
        longitude = page.re_first(r'lng:([\d]{0,3}[\.][\d]*),')
        latitude = page.re_first(r'lat:([\d]{0,2}[\.][\d]*),')
        _put(item, 'longitude', longitude)
        _put(item, 'latitude', latitude)
        if longitude and latitude:
            _set_coordinate(item)

        item['img_url'] = response.xpath(
            r'//div[@class="switch_list"][1]/div[@class="img_wrap"][1]'
            r'/img[1]/@data-src').extract_first()
        _stamp(item, response)
        return item


class FTXRentalHouseItem(scrapy.Item):
    """Item describing one rental-house detail page (FTX page layout).

    ``property_type``, ``longitude`` and ``latitude`` are not declared or
    extracted for this layout.
    """

    _id = scrapy.Field()
    title = scrapy.Field()
    location = scrapy.Field()
    price = scrapy.Field()
    house_type = scrapy.Field()
    area = scrapy.Field()
    orientation = scrapy.Field()
    floor = scrapy.Field()
    decoration = scrapy.Field()
    house_code = scrapy.Field()
    update_date = scrapy.Field()
    img_url = scrapy.Field()
    page_url = scrapy.Field()
    date = scrapy.Field()
    house_id = scrapy.Field()

    @classmethod
    def handle_response(cls, response):
        """Build an item from a rental detail-page response.

        Title and location come from the inline ``var houseInfo = {...};``
        script object; the remaining fields come from the page markup.

        :param response: Scrapy response exposing ``xpath`` and ``url``.
        :return: the populated item; optional fields are left unset when
            their node or pattern is missing.
        """
        item = cls()

        info_str = Selector(response).re_first(r'var houseInfo = {[\s\S]*};')
        if info_str:
            _put(item, 'title',
                 _search(r"projname: '([\s\S]*?)',", info_str, 1))
            district = _search(r"district: '([\s\S]*?)',", info_str, 1)
            comarea = _search(r"comarea: '([\s\S]*?)',", info_str, 1)
            location_string = ''
            if district is not None:
                location_string += district + '-'
            if comarea is not None:
                location_string += comarea
            item['location'] = location_string

        price_list = response.xpath(
            r'//div[@class ="tab-cont-right"]'
            r'/div[@class ="tr-line clearfix zf_new_title"]'
            r'/div[contains(@class, "trl-item sty1")]//text()').extract()
        if price_list:
            item['price'] = "".join(price_list).strip()

        item['house_type'] = response.xpath(
            r'//div[@class="trl-item1 w182"][div[@class="font14"]="户型"]'
            r'/div[@class="tt"]/text()').extract_first()
        item['area'] = response.xpath(
            r'//div[@class="trl-item1 w132"][div[@class="font14"]="建筑面积"]'
            r'/div[@class="tt"]/text()').extract_first()
        item['orientation'] = response.xpath(
            r'//div[@class="trl-item1 w146"][div[@class="font14"]="朝向"]'
            r'/div[@class="tt"]/text()').extract_first()

        floor_list = response.xpath(
            r'//div[@class="trl-item1 w182"]'
            r'[div[@class="font14"][contains(text(), "楼层")]]/div//text()'
        ).extract()
        if floor_list:
            item['floor'] = '-'.join(floor_list)

        item['decoration'] = response.xpath(
            r'//div[@class="trl-item1 w132"][div[@class="font14"]="装修"]'
            r'/div[@class="tt"]/text()').extract_first()

        house_code_string = response.xpath(
            r'//span[contains(text(), "房源编号")]/text()').extract_first()
        _put(item, 'house_code', _search(r'[\d]{6,}', house_code_string))

        # The date is parsed from its own span, independently of whether
        # the house-code span was found.
        house_date_string = response.xpath(
            r'//span[contains(text(), "更新时间")]/text()').extract_first()
        _put(item, 'update_date', _search(
            r'[\d]{0,4}-[\d]{0,2}-[\d]{0,2}', house_date_string))

        item['img_url'] = response.xpath(
            r'//div[@class="bigImg"]/img[1]/@src').extract_first()
        _stamp(item, response)
        return item


class ResoldHouseItem(scrapy.Item):
    """Item describing one resold-house detail page."""

    _id = scrapy.Field()
    title = scrapy.Field()
    location = scrapy.Field()
    price = scrapy.Field()
    house_type = scrapy.Field()
    area = scrapy.Field()
    orientation = scrapy.Field()
    floor = scrapy.Field()
    decoration = scrapy.Field()
    property_type = scrapy.Field()
    total_price = scrapy.Field()
    down_payment = scrapy.Field()
    monthly_payment = scrapy.Field()
    house_code = scrapy.Field()
    publish_date = scrapy.Field()
    house_price_info = scrapy.Field()
    community_price_info = scrapy.Field()
    area_price_info = scrapy.Field()
    longitude = scrapy.Field()
    latitude = scrapy.Field()
    img_url = scrapy.Field()
    page_url = scrapy.Field()
    date = scrapy.Field()
    coordinate = scrapy.Field()
    house_id = scrapy.Field()
    build_year = scrapy.Field()

    @classmethod
    def handle_response(cls, response):
        """Build an item from a resold-house detail-page response.

        :param response: Scrapy response exposing ``xpath`` and ``url``.
        :return: the populated item; optional fields are left unset when
            their node or pattern is missing.
        """
        def first(xpath):
            return response.xpath(xpath).extract_first()

        item = cls()
        item['title'] = first(r'//li[div[text()="所属小区:"]]/div/a/text()')

        location_list = response.xpath(
            r'//li[div[text()="所在位置:"]]/div/p//text()').extract()
        if location_list:
            location = ''.join(location_list).strip()
            item['location'] = location.replace('\n', '').replace('\t', '')

        item['price'] = first(r'//li[div[text()="房屋单价:"]]/div[2]/text()')
        house_type_string: str = first(
            r'normalize-space(//li[div[text()="房屋户型:"]]/div[2]/text())')
        if house_type_string:
            item['house_type'] = house_type_string.replace(' ', '')
        item['area'] = first(r'//li[div[text()="建筑面积:"]]/div[2]/text()')
        item['orientation'] = first(
            r'//li[div[text()="房屋朝向:"]]/div[2]/text()')
        item['floor'] = first(
            r'normalize-space(//li[div[text()="所在楼层:"]]/div[2]/text())')
        item['decoration'] = first(
            r'normalize-space(//li[div[text()="装修程度:"]]/div[2]/text())')
        item['property_type'] = first(
            r'normalize-space(//li[div[text()="房屋类型:"]]/div[2]/text())')
        item['total_price'] = first(
            r'//div[@class="wrapper"]/div[@class="wrapper-lf clearfix"]'
            r'/div[@class="basic-info clearfix"]/span[1]/em/text()')
        item['down_payment'] = first(
            r'normalize-space(//li[div[text()="参考首付:"]]/div[2]/text())')
        item['monthly_payment'] = first(
            r'normalize-space(//li[div[text()="参考月供:"]]/div/span/text())')
        item['build_year'] = first(
            r'normalize-space(//li[div[text()="建造年代:"]]/div[2]/text())')

        house_code_string = first(
            r'//span[contains(text(), "房屋编码")]/text()')
        _put(item, 'house_code', _search(r'[\d]{6,}', house_code_string))

        # The date is parsed from its own span, independently of whether
        # the house-code span was found.
        house_date_string = first(
            r'//span[contains(text(), "发布时间")]/text()')
        _put(item, 'publish_date', _search(
            r'[\d]{0,4}年[\d]{0,2}月[\d]{0,2}日', house_date_string))

        # Coordinates are embedded in the page text as lng : "<n>" / lat : "<n>".
        page = Selector(response)
        longitude = page.re_first(r'lng : "([\d]{0,3}[\.][\d]*)"')
        latitude = page.re_first(r'lat : "([\d]{0,2}[\.][\d]*)"')
        _put(item, 'longitude', longitude)
        _put(item, 'latitude', latitude)
        if longitude and latitude:
            _set_coordinate(item)

        item['img_url'] = first(
            r'//div[@class="switch_list"][1]/div[@class="img_wrap"][1]'
            r'/img/@data-src')
        _stamp(item, response)
        return item


class LfsAveragePriceItem(scrapy.Item):
    """Item holding the mid price read from a community page."""

    _id = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    arrow = scrapy.Field()
    rate = scrapy.Field()
    page_url = scrapy.Field()
    date = scrapy.Field()
    house_id = scrapy.Field()

    @classmethod
    def handle_response(cls, response):
        """Build an item from a community page response.

        ``arrow`` and ``rate`` are declared but not populated here.

        :param response: Scrapy response exposing ``xpath`` and ``url``.
        :return: the populated item; ``price`` is left unset when the
            ``comm_midprice`` value is not found in the page text.
        """
        item = cls()
        item['title'] = response.xpath(
            r'//div[@class="comm-title"]/a/@title').extract_first()
        _put(item, 'price', Selector(response).re_first(
            r'(?<="comm_midprice":")([0-9]*(?=","area_midprice))'))
        _stamp(item, response)
        return item