@@ -0,0 +1,496 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+from scrapy.selector import Selector
+import scrapy
+import re
+import time
+import logging
+
+
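+# Each item class below exposes a `handle_response` classmethod: a spider
+# callback passes in the downloaded response (or, for CommunityItem, an HTML
+# fragment) and gets back a populated item.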
+class ResoldApartmentItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    unit_price = scrapy.Field()
+    total_price = scrapy.Field()
+    orientation = scrapy.Field()
+    area = scrapy.Field()
+    built_year = scrapy.Field()
+    property = scrapy.Field()
+    decoration = scrapy.Field()
+    model = scrapy.Field()
+    floor = scrapy.Field()
+    image = scrapy.Field()
+    house_type = scrapy.Field()
+    trading_ownership = scrapy.Field()
+    tag = scrapy.Field()
+    location = scrapy.Field()
+    longitude = scrapy.Field()
+    latitude = scrapy.Field()
+    page_url = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+        item['title'] = response.xpath(
+            '//ul[@class="house-basic-item3"]/li[1]/span[@class="c_000 mr_10"]/a[1]/text()').extract_first().strip()
+
+        unit_price_string = response.xpath('//span[@class="unit"]/text()').extract_first()
+        item['unit_price'] = re.search(r'[1-9][\d]*', unit_price_string).group()
+
+        # Total price: normalise to yuan according to the unit character ("万" / "千")
+        total_price_string = response.xpath('//span[@class="price"]/text()').extract_first()
+        total_price = re.search(r'[0-9]+(\.)?[0-9]*', total_price_string).group()
+        price_unit = response.xpath(r'//span[@class="price"]/b/text()').extract_first()
+        if price_unit == '万':
+            total_price = str(int(float(total_price) * 10000))
+        elif price_unit == '千':
+            total_price = str(int(float(total_price) * 1000))
+        item['total_price'] = total_price
+
+        item['orientation'] = response.xpath('//p[@class="toward"]/span[@class="main"]/text()').extract_first()
+
+        # Construction year
+        built_year_str = response.xpath(r'//p[@class="toward"]/span[@class="sub"]/text()').extract_first()
+        if built_year_str:
+            built_year_match = re.search(r'[\d]+', built_year_str)
+            if built_year_match:
+                item['built_year'] = built_year_match.group()
+
+        area_string = response.xpath(
+            '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[3]/span[2]/text()').extract_first()
+        item['area'] = re.search(r'[\d]+', area_string).group()
+
+        item['property'] = response.xpath(
+            '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-right"]/li[3]/span[2]/text()').re(r'[\d]+')[0]
+        item['decoration'] = response.xpath(
+            '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-right"]/li[2]/span[2]/text()').extract_first()
+        item['model'] = response.xpath(
+            '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[2]/span[2]/text()').extract_first()
+        item['floor'] = response.xpath(
+            '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-right"]/li[1]/span[2]/text()').extract_first()
+        item['house_type'] = response.xpath('//div[@id="generalExpense"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[2]/span[2]/text()').extract_first()
+        item['trading_ownership'] = response.xpath('//div[@id="generalExpense"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[3]/span[2]/text()').extract_first()
+
+        # Cover image
+        item['image'] = response.xpath(
+            r'//div[@class="basic-pic-list pr"]/ul[@id="leftImg"]/li[1]/img/@data-value').extract_first()
+
+        # Location
+        location_list = response.xpath(r'//ul[@class="house-basic-item3"]/li[2]/span[2]/a/text()').extract()
+        if location_list:
+            item['location'] = '-'.join(location_list)
+
+        # Tags
+        tag_list = response.xpath(r'//p[@class="house-update-info"]/span[@class="ts"]/text()').extract()
+        if tag_list:
+            item['tag'] = tag_list[0]
+
+        # Longitude and latitude, embedded in an inline <script> block
+        script_string = response.xpath(r'//script[@type="text/javascript"]').extract_first()
+        latitude_match = re.search(r'"lat":([1-9])[\d](\.)[\d]*,"', script_string).group()
+        longitude_match = re.search(r'"lon":[1-9][\d][\d](\.)[\d]*,"', script_string).group()
+        item['latitude'] = re.search(r'([1-9])[\d](\.)[\d]*', latitude_match).group()
+        item['longitude'] = re.search(r'[1-9][\d][\d](\.)[\d]*', longitude_match).group()
+
+        item['page_url'] = response.url
+
+        item['house_id'] = '109'
+
+        return item
+
+
+class CommunityItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    unit_price = scrapy.Field()
+    floating_rate = scrapy.Field()
+    built_year = scrapy.Field()
+    location = scrapy.Field()
+    page_url = scrapy.Field()
+    type = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response, type):
+        item = cls()
+        # Here `response` is an HTML fragment (one listing block), not a Response object
+        selector = Selector(text=response)
+        item['title'] = selector.xpath(r'//h3/a/@title').extract_first()
+        item['unit_price'] = selector.xpath(r'//div[@class="li-side"]/p/strong/text()').extract_first()
+        floating_rate = selector.xpath(r'//div[@class="li-side"]/p[@class="price-txt"]/text()').extract_first()
+        if not floating_rate:
+            floating_rate = selector.xpath(r'//div[@class="li-side"]/p[@class="price-txt price-down"]/text()').extract_first()
+        item['floating_rate'] = floating_rate
+        item['location'] = selector.xpath(r'//div[@class="li-info"]/address/text()').extract_first(default='').strip()
+        item['page_url'] = selector.xpath(r'//div[@_soj="xqlb"]/@link').extract_first(default='').strip()
+        item['built_year'] = selector.xpath(r'//p[@class="date"]/text()').extract_first(default='').strip()
+        item['type'] = type
+        item['house_id'] = '109'
+        return item
+
+
+class FTXCommunityItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    unit_price = scrapy.Field()
+    floating_rate = scrapy.Field()
+    year_floating_tare = scrapy.Field()
+    built_year = scrapy.Field()
+    property = scrapy.Field()
+    property_type = scrapy.Field()
+    building_type = scrapy.Field()
+    greening_rate = scrapy.Field()
+    plot_ratio = scrapy.Field()
+    total_area = scrapy.Field()
+    building_area = scrapy.Field()
+    construction = scrapy.Field()
+    location = scrapy.Field()
+    region = scrapy.Field()
+    page_url = scrapy.Field()
+    img_url = scrapy.Field()
+    predict_type = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+        title_string: str = response.xpath(r'//div[@class="logoBox_sq"]/div[@class="ceninfo_sq"]/h1/a[@class="tt"]/text()').extract_first()
+        if title_string:
+            item['title'] = title_string.replace('小区网', '')
+        item['unit_price'] = response.xpath(r'//div[@class="box detaiLtop mt20 clearfix"]/dl[1]/dd/span/text()').extract_first()
+        item['floating_rate'] = response.xpath(r'//div[@class="box detaiLtop mt20 clearfix"]/dl[2]/dd/span/text()').extract_first()
+        item['year_floating_tare'] = response.xpath(r'//div[@class="box detaiLtop mt20 clearfix"]/dl[3]/dd/span/text()').extract_first()
+
+        item['location'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="小区地址:"]/text()').extract_first()
+        item['region'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="所属区域:"]/text()').extract_first()
+        # Length of the property right in years, pulled from the "产权描述" field
+        property_string = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="产权描述:"]/text()').extract_first()
+        item['property'] = None
+        if property_string:
+            property_match = re.search(r'[\d]{1,2}', property_string)
+            if property_match:
+                item['property'] = property_match.group(0)
+
+        item['property_type'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="物业类别:"]/text()').extract_first()
+        # Guess the property type: 70-year rights -> residential, otherwise apartment
+        if not item['property']:
+            item['predict_type'] = '其他'
+        elif item['property'] == '70':
+            item['predict_type'] = '住宅'
+        else:
+            item['predict_type'] = '公寓'
+
+        item['construction'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑结构:"]/span/text()').extract_first()
+        item['built_year'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑年代:"]/text()').extract_first()
+        item['building_type'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑类型:"]/text()').extract_first()
+        item['greening_rate'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="绿 化 率:"]/text()').extract_first()
+        item['plot_ratio'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="容 积 率:"]/text()').extract_first()
+        item['total_area'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="占地面积:"]/text()').extract_first()
+        item['building_area'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑面积:"]/text()').extract_first()
+        item['img_url'] = response.xpath(r'//div[@class="logoBox_sq"]/div[@class="logopic_sq"]/a/img/@src').extract_first()
+        item['page_url'] = response.url
+        item['house_id'] = '109'
+
+        return item
+
+
+class RentalHouseItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    location = scrapy.Field()
+    price = scrapy.Field()
+    house_type = scrapy.Field()
+    area = scrapy.Field()
+    orientation = scrapy.Field()
+    floor = scrapy.Field()
+    decoration = scrapy.Field()
+    property_type = scrapy.Field()
+    house_code = scrapy.Field()
+    publish_date = scrapy.Field()
+    longitude = scrapy.Field()
+    latitude = scrapy.Field()
+    img_url = scrapy.Field()
+    page_url = scrapy.Field()
+    date = scrapy.Field()
+    coordinate = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+        # First anchor is the community name, the remaining anchors make up the location
+        name_list: list = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="小区:"]/a/text()').extract()
+        location_string = ''
+        if name_list:
+            item['title'] = name_list[0]
+            if len(name_list) > 1:
+                location_string += name_list[1]
+                location_string += "-"
+            if len(name_list) > 2:
+                location_string += name_list[2]
+        item['location'] = location_string
+        price_list = response.xpath(r'//li[@class="full-line cf"]/span[@class="price"]//text()').extract()
+        if price_list:
+            item['price'] = "".join(price_list)
+        item['house_type'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="户型:"]/span[@class="info"]/text()').extract_first()
+        item['area'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="面积:"]/span[@class="info"]/text()').extract_first()
+        item['orientation'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="朝向:"]/span[@class="info"]/text()').extract_first()
+        item['floor'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="楼层:"]/span[@class="info"]/text()').extract_first()
+        item['decoration'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="装修:"]/span[@class="info"]/text()').extract_first()
+        item['property_type'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="类型:"]/span[@class="info"]/text()').extract_first()
+
+        # House code and publish date both live in the "房屋信息" block
+        house_info_string = response.xpath(r'//div[@class="mod-title bottomed"][h3="房屋信息"]/div/text()').extract_first()
+        if house_info_string:
+            code_match = re.search(r'[\d]{6,}', house_info_string)
+            if code_match:
+                item['house_code'] = code_match.group()
+
+            date_match = re.search(r'[\d]{0,4}年[\d]{0,2}月[\d]{0,2}日', house_info_string)
+            if date_match:
+                item['publish_date'] = date_match.group()
+
+        # Coordinates are embedded in the page's JavaScript
+        longitude_match = Selector(response).re(r'lng:[\d]{0,3}[\.][\d]*,')
+        if longitude_match:
+            item['longitude'] = re.search(r'[\d]{0,3}[\.][\d]*', longitude_match[0]).group()
+
+        latitude_match = Selector(response).re(r'lat:[\d]{0,2}[\.][\d]*,')
+        if latitude_match:
+            item['latitude'] = re.search(r'[\d]{0,2}[\.][\d]*', latitude_match[0]).group()
+
+        if longitude_match and latitude_match:
+            try:
+                item['coordinate'] = [float(item['longitude']), float(item['latitude'])]
+            except Exception as err:
+                logging.error('type conversion error: %s', err)
+
+        item['img_url'] = response.xpath(r'//div[@class="switch_list"][1]/div[@class="img_wrap"][1]/img[1]/@data-src').extract_first()
+        item['page_url'] = response.url
+        item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
+        item['house_id'] = '109'
+
+        return item
+
+
+class FTXRentalHouseItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    location = scrapy.Field()
+    price = scrapy.Field()
+    house_type = scrapy.Field()
+    area = scrapy.Field()
+    orientation = scrapy.Field()
+    floor = scrapy.Field()
+    decoration = scrapy.Field()
+    # property_type = scrapy.Field()
+    house_code = scrapy.Field()
+    update_date = scrapy.Field()
+    # longitude = scrapy.Field()
+    # latitude = scrapy.Field()
+    img_url = scrapy.Field()
+    page_url = scrapy.Field()
+    date = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+
+        # Community name and location only appear in the inline `houseInfo` JS object
+        house_info_match = Selector(response).re(r'var houseInfo = {[\s\S]*};')
+        if house_info_match:
+            info_str = house_info_match[0]
+            title_match = re.search(r"projname: '[\s\S]*?',", info_str)
+            if title_match:
+                title_str = title_match[0]
+                title_str = title_str.replace("projname: '", '')
+                title_str = title_str.replace("',", '')
+                item['title'] = title_str
+
+            district_match = re.search(r"district: '[\s\S]*?',", info_str)
+            location_string = ''
+            if district_match:
+                district_str = district_match[0]
+                district_str = district_str.replace("district: '", '')
+                district_str = district_str.replace("',", '')
+                location_string += district_str
+                location_string += '-'
+
+            comarea_match = re.search(r"comarea: '[\s\S]*?',", info_str)
+            if comarea_match:
+                comarea_str = comarea_match[0]
+                comarea_str = comarea_str.replace("comarea: '", '')
+                comarea_str = comarea_str.replace("',", '')
+                location_string += comarea_str
+
+            item['location'] = location_string
+
+        price_list = response.xpath(r'//div[@class ="tab-cont-right"]/div[@class ="tr-line clearfix zf_new_title"]/div[contains(@class, "trl-item sty1")]//text()').extract()
+        if price_list:
+            item['price'] = "".join(price_list).strip()
+        item['house_type'] = response.xpath(r'//div[@class="trl-item1 w182"][div[@class="font14"]="户型"]/div[@class="tt"]/text()').extract_first()
+        item['area'] = response.xpath(r'//div[@class="trl-item1 w132"][div[@class="font14"]="建筑面积"]/div[@class="tt"]/text()').extract_first()
+        item['orientation'] = response.xpath(r'//div[@class="trl-item1 w146"][div[@class="font14"]="朝向"]/div[@class="tt"]/text()').extract_first()
+
+        floor_list = response.xpath(r'//div[@class="trl-item1 w182"][div[@class="font14"][contains(text(), "楼层")]]/div//text()').extract()
+        if floor_list:
+            item['floor'] = '-'.join(floor_list)
+
+        item['decoration'] = response.xpath(r'//div[@class="trl-item1 w132"][div[@class="font14"]="装修"]/div[@class="tt"]/text()').extract_first()
+        # item['property_type'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="类型:"]/span[@class="info"]/text()').extract_first()
+
+        house_code_string = response.xpath(r'//span[contains(text(), "房源编号")]/text()').extract_first()
+        if house_code_string:
+            code_match = re.search(r'[\d]{6,}', house_code_string)
+            if code_match:
+                item['house_code'] = code_match.group()
+        house_date_string = response.xpath(r'//span[contains(text(), "更新时间")]/text()').extract_first()
+        if house_date_string:
+            date_match = re.search(r'[\d]{0,4}-[\d]{0,2}-[\d]{0,2}', house_date_string)
+            if date_match:
+                item['update_date'] = date_match.group()
+
+        item['img_url'] = response.xpath(r'//div[@class="bigImg"]/img[1]/@src').extract_first()
+        item['page_url'] = response.url
+        item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
+        item['house_id'] = '109'
+
+        return item
+
+
+class ResoldHouseItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    location = scrapy.Field()
+    price = scrapy.Field()
+    house_type = scrapy.Field()
+    area = scrapy.Field()
+    orientation = scrapy.Field()
+    floor = scrapy.Field()
+    decoration = scrapy.Field()
+    property_type = scrapy.Field()
+    total_price = scrapy.Field()
+    down_payment = scrapy.Field()
+    monthly_payment = scrapy.Field()
+    house_code = scrapy.Field()
+    publish_date = scrapy.Field()
+    house_price_info = scrapy.Field()
+    community_price_info = scrapy.Field()
+    area_price_info = scrapy.Field()
+    longitude = scrapy.Field()
+    latitude = scrapy.Field()
+    img_url = scrapy.Field()
+    page_url = scrapy.Field()
+    date = scrapy.Field()
+    coordinate = scrapy.Field()
+    house_id = scrapy.Field()
+    build_year = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+
+        item['title'] = response.xpath(r'//li[div[text()="所属小区:"]]/div/a/text()').extract_first()
+        location_list = response.xpath(r'//li[div[text()="所在位置:"]]/div/p//text()').extract()
+        if location_list:
+            location = ''.join(location_list).strip()
+            location = location.replace('\n', '')
+            location = location.replace('\t', '')
+            item['location'] = location
+
+        item['price'] = response.xpath(r'//li[div[text()="房屋单价:"]]/div[2]/text()').extract_first()
+        house_type_string: str = response.xpath(r'normalize-space(//li[div[text()="房屋户型:"]]/div[2]/text())').extract_first()
+        if house_type_string:
+            item['house_type'] = house_type_string.replace(' ', '')
+
+        item['area'] = response.xpath(r'//li[div[text()="建筑面积:"]]/div[2]/text()').extract_first()
+        item['orientation'] = response.xpath(r'//li[div[text()="房屋朝向:"]]/div[2]/text()').extract_first()
+        item['floor'] = response.xpath(r'normalize-space(//li[div[text()="所在楼层:"]]/div[2]/text())').extract_first()
+        item['decoration'] = response.xpath(r'normalize-space(//li[div[text()="装修程度:"]]/div[2]/text())').extract_first()
+        item['property_type'] = response.xpath(r'normalize-space(//li[div[text()="房屋类型:"]]/div[2]/text())').extract_first()
+        item['total_price'] = response.xpath(r'//div[@class="wrapper"]/div[@class="wrapper-lf clearfix"]/div[@class="basic-info clearfix"]/span[1]/em/text()').extract_first()
+        item['down_payment'] = response.xpath(r'normalize-space(//li[div[text()="参考首付:"]]/div[2]/text())').extract_first()
+        item['monthly_payment'] = response.xpath(r'normalize-space(//li[div[text()="参考月供:"]]/div/span/text())').extract_first()
+        item['build_year'] = response.xpath(r'normalize-space(//li[div[text()="建造年代:"]]/div[2]/text())').extract_first()
+
+        house_code_string = response.xpath(r'//span[contains(text(), "房屋编码")]/text()').extract_first()
+        if house_code_string:
+            code_match = re.search(r'[\d]{6,}', house_code_string)
+            if code_match:
+                item['house_code'] = code_match.group()
+        house_date_string = response.xpath(r'//span[contains(text(), "发布时间")]/text()').extract_first()
+        if house_date_string:
+            date_match = re.search(r'[\d]{0,4}年[\d]{0,2}月[\d]{0,2}日', house_date_string)
+            if date_match:
+                item['publish_date'] = date_match.group()
+
+        # Coordinates are embedded in the page's JavaScript
+        longitude_match = Selector(response).re(r'lng : "[\d]{0,3}[\.][\d]*"')
+        if longitude_match:
+            item['longitude'] = re.search(r'[\d]{0,3}[\.][\d]*', longitude_match[0]).group()
+
+        latitude_match = Selector(response).re(r'lat : "[\d]{0,2}[\.][\d]*"')
+        if latitude_match:
+            item['latitude'] = re.search(r'[\d]{0,2}[\.][\d]*', latitude_match[0]).group()
+
+        if longitude_match and latitude_match:
+            try:
+                item['coordinate'] = [float(item['longitude']), float(item['latitude'])]
+            except Exception as err:
+                logging.error('type conversion error: %s', err)
+
+        item['img_url'] = response.xpath(r'//div[@class="switch_list"][1]/div[@class="img_wrap"][1]/img/@data-src').extract_first()
+
+        item['page_url'] = response.url
+        item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
+        item['house_id'] = '109'
+
+        return item
+
+
+class LfsAveragePriceItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    price = scrapy.Field()
+    arrow = scrapy.Field()
+    rate = scrapy.Field()
+    page_url = scrapy.Field()
+    date = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+
+        item['title'] = response.xpath(r'//div[@class="comm-title"]/a/@title').extract_first()
+        # The average price only appears inside a JS variable, so pull it out with a regex
+        price_match = Selector(response).re(r'(?<="comm_midprice":")([0-9]*(?=","area_midprice))')
+        if price_match:
+            item['price'] = price_match[0]
+        # item['arrow'] = response.xpath(r'//i[@class="arrow"]/text()').extract_first()
+        # item['rate'] = response.xpath(r'normalize-space(//span[@class="status level"]/text())').extract_first()
+        item['page_url'] = response.url
+        item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
+        item['house_id'] = '109'
+
+        return item
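+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage (a minimal sketch, not part of the item definitions):
+# a spider callback would typically just yield whatever `handle_response`
+# returns. The spider name, module path and start URL below are placeholder
+# assumptions, not values taken from this project.
+#
+#     import scrapy
+#     from ..items import ResoldHouseItem
+#
+#     class ResoldHouseExampleSpider(scrapy.Spider):
+#         name = 'resold_house_example'
+#         start_urls = ['https://example.com/some-resold-house.html']
+#
+#         def parse(self, response):
+#             # All parsing lives in the item class, so the callback stays tiny.
+#             yield ResoldHouseItem.handle_response(response)
+# ---------------------------------------------------------------------------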