items.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496
# -*- coding: utf-8 -*-
# Define here the models for your scraped items.
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
  6. from scrapy.selector import Selector
  7. import scrapy
  8. import re
  9. import time
  10. import logging
  11. class ResoldApartmentItem(scrapy.Item):
  12. _id = scrapy.Field()
  13. title = scrapy.Field()
  14. unit_price = scrapy.Field()
  15. total_price = scrapy.Field()
  16. orientation = scrapy.Field()
  17. area = scrapy.Field()
  18. built_year = scrapy.Field()
  19. property = scrapy.Field()
  20. decoration = scrapy.Field()
  21. model = scrapy.Field()
  22. floor = scrapy.Field()
  23. image = scrapy.Field()
  24. house_type = scrapy.Field()
  25. trading_ownership = scrapy.Field()
  26. tag = scrapy.Field()
  27. location = scrapy.Field()
  28. longitude = scrapy.Field()
  29. latitude = scrapy.Field()
  30. page_url = scrapy.Field()
  31. house_id = scrapy.Field()
  32. @classmethod
  33. def handle_response(cls, response):
  34. item = cls()
  35. item['title'] = response.xpath(
  36. '//ul[@class="house-basic-item3"]/li[1]/span[@class="c_000 mr_10"]/a[1]/text()').extract_first().strip()
  37. unit_price_string = response.xpath('//span[@class="unit"]/text()').extract_first()
  38. item['unit_price'] = re.search(r'[1-9][\d]*', unit_price_string).group()
  39. # 处理总价
  40. total_price_string = response.xpath('//span[@class="price"]/text()').extract_first()
  41. total_price = re.search('[0-9]+(\.)?[0-9]*', total_price_string).group()
  42. price_unit = response.xpath(r'//span[@class="price"]/b/text()', )
  43. if price_unit == '万':
  44. total_price = str(int(total_price) * 10000)
  45. elif price_unit == '千':
  46. total_price = str(int(total_price) * 1000)
  47. item['total_price'] = total_price
  48. item['orientation'] = response.xpath('//p[@class="toward"]/span[@class="main"]/text()').extract_first()
  49. # 建筑年代
  50. built_year_str = response.xpath(r'//p[@class="toward"]/span[@class="sub"]/text()').extract_first()
  51. if built_year_str:
  52. item['built_year'] = re.search(r'[\d]*', built_year_str).group()
  53. area_string = response.xpath(
  54. '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[3]/span[2]/text()').extract_first()
  55. item['area'] = re.search(r'[\d]+', area_string).group()
  56. item['property'] = response.xpath(
  57. '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-right"]/li[3]/span[2]/text()').re(r'[\d]+')[0]
  58. item['decoration'] = response.xpath(
  59. '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-right"]/li[2]/span[2]/text()').extract_first()
  60. item['model'] = response.xpath(
  61. '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[2]/span[2]/text()').extract_first()
  62. item['floor'] = response.xpath(
  63. '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-right"]/li[1]/span[2]/text()').extract_first()
  64. item['house_type'] = response.xpath('//div[@id="generalExpense"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[2]/span[2]/text()').extract_first()
  65. item['trading_ownership'] = response.xpath('//div[@id="generalExpense"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[3]/span[2]/text()').extract_first()
  66. # 图片
  67. item['image'] = response.xpath(
  68. r'//div[@class="basic-pic-list pr"]/ul[@id="leftImg"]/li[1]/img/@data-value').extract_first()
  69. # 位置
  70. location_list = response.xpath(r'//ul[@class="house-basic-item3"]/li[2]/span[2]/a/text()').extract()
  71. if location_list:
  72. location_str = '-'.join(location_list)
  73. item['location'] = location_str
  74. # 标签
  75. tag_list = response.xpath(r'//p[@class="house-update-info"]/span[@class="ts"]/text()').extract()
  76. if tag_list:
  77. item['tag'] = tag_list[0]
  78. # 经纬度
  79. script_string = response.xpath(r'//script[@type="text/javascript"]').extract_first()
  80. latitude_math = re.search(r'"lat":([1-9])[\d](\.)[\d]*,"', script_string).group()
  81. longitude_math = re.search(r'"lon":[1-9][\d][\d](\.)[\d]*,"', script_string).group()
  82. item['latitude'] = re.search(r'([1-9])[\d](\.)[\d]*', latitude_math).group()
  83. item['longitude'] = re.search(r'[1-9][\d][\d](\.)[\d]*', longitude_math).group()
  84. page_url = response._url
  85. item['page_url'] = page_url
  86. item['house_id'] = '109'
  87. return item
  88. class CommunityItem(scrapy.Item):
  89. _id = scrapy.Field()
  90. title = scrapy.Field()
  91. unit_price = scrapy.Field()
  92. floating_rate = scrapy.Field()
  93. built_year = scrapy.Field()
  94. location = scrapy.Field()
  95. page_url = scrapy.Field()
  96. type = scrapy.Field()
  97. house_id = scrapy.Field()
  98. @classmethod
  99. def handle_response(cls, response, type):
  100. item = cls()
  101. selector = Selector(text=response)
  102. item['title'] = selector.xpath(r'//h3/a/@title').extract_first()
  103. item['unit_price'] = selector.xpath(r'//div[@class="li-side"]/p/strong/text()').extract_first()
  104. floating_rate = selector.xpath(r'//div[@class="li-side"]/p[@class="price-txt"]/text()').extract_first()
  105. if not floating_rate:
  106. floating_rate = selector.xpath(r'//div[@class="li-side"]/p[@class="price-txt price-down"]/text()').extract_first()
  107. item['floating_rate'] = floating_rate
  108. item['location'] = selector.xpath(r'//div[@class="li-info"]/address/text()').extract_first().strip()
  109. item['page_url'] = selector.xpath(r'//div[@_soj="xqlb"]/@link').extract_first().strip()
  110. item['built_year'] = selector.xpath(r'//p[@class="date"]/text()').extract_first().strip()
  111. item['type'] = type
  112. item['house_id'] = '109'
  113. return item
  114. class FTXCommunityItem(scrapy.Item):
  115. _id = scrapy.Field()
  116. title = scrapy.Field()
  117. unit_price = scrapy.Field()
  118. floating_rate = scrapy.Field()
  119. year_floating_tare = scrapy.Field()
  120. built_year = scrapy.Field()
  121. property = scrapy.Field()
  122. property_type = scrapy.Field()
  123. building_type = scrapy.Field()
  124. greening_rate = scrapy.Field()
  125. plot_ratio = scrapy.Field()
  126. total_area = scrapy.Field()
  127. building_area = scrapy.Field()
  128. construction = scrapy.Field()
  129. location = scrapy.Field()
  130. region = scrapy.Field()
  131. page_url = scrapy.Field()
  132. img_url = scrapy.Field()
  133. predict_type = scrapy.Field()
  134. house_id = scrapy.Field()
  135. @classmethod
  136. def handle_response(cls, response):
  137. item = cls()
  138. title_string: str = response.xpath(r'//div[@class="logoBox_sq"]/div[@class="ceninfo_sq"]/h1/a[@class="tt"]/text()').extract_first()
  139. if title_string:
  140. item['title'] = title_string.replace('小区网', '')
  141. item['unit_price'] = response.xpath(r'//div[@class="box detaiLtop mt20 clearfix"]/dl[1]/dd/span/text()').extract_first()
  142. item['floating_rate'] = response.xpath(r'//div[@class="box detaiLtop mt20 clearfix"]/dl[2]/dd/span/text()').extract_first()
  143. item['year_floating_tare'] = response.xpath(r'//div[@class="box detaiLtop mt20 clearfix"]/dl[3]/dd/span/text()').extract_first()
  144. item['location'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="小区地址:"]/text()').extract_first()
  145. item['region'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="所属区域:"]/text()').extract_first()
  146. property_string = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="产权描述:"]/text()').extract_first()
  147. item['property'] = None
  148. if property_string:
  149. re_list = re.search(r'[\d]{1,2}', property_string)
  150. if re_list:
  151. item['property'] = re_list.group(0)
  152. item['property_type'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="物业类别:"]/text()').extract_first()
  153. if not item['property']:
  154. item['predict_type'] = '其他'
  155. elif item['property'] == '70':
  156. item['predict_type'] = '住宅'
  157. else:
  158. item['predict_type'] = '公寓'
  159. item['construction'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑结构:"]/span/text()').extract_first()
  160. item['built_year'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑年代:"]/text()').extract_first()
  161. item['building_type'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑类型:"]/text()').extract_first()
  162. item['greening_rate'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="绿 化 率:"]/text()').extract_first()
  163. item['plot_ratio'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="容 积 率:"]/text()').extract_first()
  164. item['total_area'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="占地面积:"]/text()').extract_first()
  165. item['building_area'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑面积:"]/text()').extract_first()
  166. item['img_url'] = response.xpath(r'//div[@class="logoBox_sq"]/div[@class="logopic_sq"]/a/img/@src').extract_first()
  167. item['page_url'] = response._url
  168. item['house_id'] = '109'
  169. return item
  170. class RentalHouseItem(scrapy.Item):
  171. _id = scrapy.Field()
  172. title = scrapy.Field()
  173. location = scrapy.Field()
  174. price = scrapy.Field()
  175. house_type = scrapy.Field()
  176. area = scrapy.Field()
  177. orientation = scrapy.Field()
  178. floor = scrapy.Field()
  179. decoration = scrapy.Field()
  180. property_type = scrapy.Field()
  181. house_code = scrapy.Field()
  182. publish_date = scrapy.Field()
  183. longitude = scrapy.Field()
  184. latitude = scrapy.Field()
  185. img_url = scrapy.Field()
  186. page_url = scrapy.Field()
  187. date = scrapy.Field()
  188. coordinate = scrapy.Field()
  189. house_id = scrapy.Field()
  190. @classmethod
  191. def handle_response(cls, response):
  192. item = cls()
  193. name_list: list = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="小区:"]/a/text()').extract()
  194. location_string = ''
  195. if name_list:
  196. if len(name_list) > 0:
  197. item['title'] = name_list[0]
  198. if len(name_list) > 1:
  199. location_string += name_list[1]
  200. location_string += "-"
  201. if len(name_list) > 2:
  202. location_string += name_list[2]
  203. item['location'] = location_string
  204. price_list = response.xpath(r'//li[@class="full-line cf"]/span[@class="price"]//text()').extract()
  205. if price_list:
  206. item['price'] = "".join(price_list)
  207. item['house_type'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="户型:"]/span[@class="info"]/text()').extract_first()
  208. item['area'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="面积:"]/span[@class="info"]/text()').extract_first()
  209. item['orientation'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="朝向:"]/span[@class="info"]/text()').extract_first()
  210. item['floor'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="楼层:"]/span[@class="info"]/text()').extract_first()
  211. item['decoration'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="装修:"]/span[@class="info"]/text()').extract_first()
  212. item['property_type'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="类型:"]/span[@class="info"]/text()').extract_first()
  213. house_info_string = response.xpath(r'//div[@class="mod-title bottomed"][h3="房屋信息"]/div/text()').extract_first()
  214. if house_info_string:
  215. code_match = re.search(r'[\d]{6,}', house_info_string)
  216. if code_match:
  217. item['house_code'] = code_match.group()
  218. date_match = re.search(r'[\d]{0,4}年[\d]{0,2}月[\d]{0,2}日', house_info_string)
  219. if date_match:
  220. item['publish_date'] = date_match.group()
  221. longitude_match = Selector(response).re(r'lng:[\d]{0,3}[\.][\d]*,')
  222. if longitude_match:
  223. item['longitude'] = re.search(r'[\d]{0,3}[\.][\d]*', longitude_match[0]).group()
  224. latitude_match = Selector(response).re(r'lat:[\d]{0,2}[\.][\d]*,')
  225. if latitude_match:
  226. item['latitude'] = re.search(r'[\d]{0,2}[\.][\d]*', latitude_match[0]).group()
  227. if longitude_match and latitude_match:
  228. try:
  229. item['coordinate'] = [float(item['longitude']), float(item['latitude'])]
  230. except Exception as err:
  231. logging.error('type conversion error ! reason: ' + '-'.join(err.args))
  232. item['img_url'] = response.xpath(r'//div[@class="switch_list"][1]/div[@class="img_wrap"][1]/img[1]/@data-src').extract_first()
  233. item['page_url'] = response._url
  234. item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
  235. item['house_id'] = '109'
  236. return item
  237. class FTXRentalHouseItem(scrapy.Item):
  238. _id = scrapy.Field()
  239. title = scrapy.Field()
  240. location = scrapy.Field()
  241. price = scrapy.Field()
  242. house_type = scrapy.Field()
  243. area = scrapy.Field()
  244. orientation = scrapy.Field()
  245. floor = scrapy.Field()
  246. decoration = scrapy.Field()
  247. # property_type = scrapy.Field()
  248. house_code = scrapy.Field()
  249. update_date = scrapy.Field()
  250. # longitude = scrapy.Field()
  251. # latitude = scrapy.Field()
  252. img_url = scrapy.Field()
  253. page_url = scrapy.Field()
  254. date = scrapy.Field()
  255. house_id = scrapy.Field()
  256. @classmethod
  257. def handle_response(cls, response):
  258. item = cls()
  259. house_info_match = Selector(response).re(r'var houseInfo = {[\s\S]*};')
  260. if house_info_match:
  261. info_str = house_info_match[0]
  262. title_match = re.search(r"projname: '[\s\S]*?',", info_str)
  263. if title_match:
  264. title_str = title_match[0]
  265. title_str = title_str.replace("projname: '", '')
  266. title_str = title_str.replace("',", '')
  267. item['title'] = title_str
  268. district_math = re.search(r"district: '[\s\S]*?',", info_str)
  269. location_string = ''
  270. if district_math:
  271. district_str = district_math[0]
  272. district_str = district_str.replace("district: '", '')
  273. district_str = district_str.replace("',", '')
  274. location_string += district_str
  275. location_string += '-'
  276. comarea_math = re.search(r"comarea: '[\s\S]*?',", info_str)
  277. if comarea_math:
  278. comarea_str = comarea_math[0]
  279. comarea_str = comarea_str.replace("comarea: '", '')
  280. comarea_str = comarea_str.replace("',", '')
  281. location_string += comarea_str
  282. item['location'] = location_string
  283. # name_list: list = response.xpath(r'//div[div[@class="lab"][text()="小      区"]]/div[contains(@class, "rcont")]/a/text()').extract()
  284. # location_string = ''
  285. # if name_list:
  286. # if len(name_list) > 0:
  287. # item['title'] = name_list[0]
  288. # if len(name_list) > 1:
  289. # location_string += name_list[1]
  290. # location_string += "-"
  291. # if len(name_list) > 2:
  292. # location_string += name_list[2]
  293. # item['location'] = location_string
  294. price_list = response.xpath(r'//div[@class ="tab-cont-right"]/div[@class ="tr-line clearfix zf_new_title"]/div[contains(@class, "trl-item sty1")]//text()').extract()
  295. if price_list:
  296. item['price'] = "".join(price_list).strip()
  297. item['house_type'] = response.xpath(r'//div[@class="trl-item1 w182"][div[@class="font14"]="户型"]/div[@class="tt"]/text()').extract_first()
  298. item['area'] = response.xpath(r'//div[@class="trl-item1 w132"][div[@class="font14"]="建筑面积"]/div[@class="tt"]/text()').extract_first()
  299. item['orientation'] = response.xpath(r'//div[@class="trl-item1 w146"][div[@class="font14"]="朝向"]/div[@class="tt"]/text()').extract_first()
  300. floor_list = response.xpath(r'//div[@class="trl-item1 w182"][div[@class="font14"][contains(text(), "楼层")]]/div//text()').extract()
  301. if floor_list:
  302. floor_str = '-'.join(floor_list)
  303. item['floor'] = floor_str
  304. item['decoration'] = response.xpath(r'//div[@class="trl-item1 w132"][div[@class="font14"]="装修"]/div[@class="tt"]/text()').extract_first()
  305. # item['property_type'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="类型:"]/span[@class="info"]/text()').extract_first()
  306. # house_info_string = response.xpath(r'//div[@class="mod-title bottomed"][h3="房屋信息"]/div/text()').extract_first()
  307. house_code_string = response.xpath(r'//span[contains(text(), "房源编号")]/text()').extract_first()
  308. if house_code_string:
  309. code_match = re.search(r'[\d]{6,}', house_code_string)
  310. if code_match:
  311. item['house_code'] = code_match.group()
  312. house_date_string = response.xpath(r'//span[contains(text(), "更新时间")]/text()').extract_first()
  313. if house_code_string:
  314. date_match = re.search(r'[\d]{0,4}-[\d]{0,2}-[\d]{0,2}', house_date_string)
  315. if date_match:
  316. item['update_date'] = date_match.group()
  317. # longitude_match = Selector(response).re(r'lng:[\d]{0,3}[\.][\d]*,')
  318. # if longitude_match:
  319. # item['longitude'] = re.search(r'[\d]{0,3}[\.][\d]*', longitude_match[0]).group()
  320. #
  321. # latitude_match = Selector(response).re(r'[\d]{0,2}[\.][\d]*,')
  322. # if latitude_match:
  323. # item['latitude'] = re.search(r'[\d]{0,2}[\.][\d]*', latitude_match[0]).group()
  324. item['img_url'] = response.xpath(r'//div[@class="bigImg"]/img[1]/@src').extract_first()
  325. item['page_url'] = response._url
  326. item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
  327. item['house_id'] = '109'
  328. return item
  329. class ResoldHouseItem(scrapy.Item):
  330. _id = scrapy.Field()
  331. title = scrapy.Field()
  332. location = scrapy.Field()
  333. price = scrapy.Field()
  334. house_type = scrapy.Field()
  335. area = scrapy.Field()
  336. orientation = scrapy.Field()
  337. floor = scrapy.Field()
  338. decoration = scrapy.Field()
  339. property_type = scrapy.Field()
  340. total_price = scrapy.Field()
  341. down_payment = scrapy.Field()
  342. monthly_payment = scrapy.Field()
  343. house_code = scrapy.Field()
  344. publish_date = scrapy.Field()
  345. house_price_info = scrapy.Field()
  346. community_price_info = scrapy.Field()
  347. area_price_info = scrapy.Field()
  348. longitude = scrapy.Field()
  349. latitude = scrapy.Field()
  350. img_url = scrapy.Field()
  351. page_url = scrapy.Field()
  352. date = scrapy.Field()
  353. coordinate = scrapy.Field()
  354. house_id = scrapy.Field()
  355. build_year = scrapy.Field()
  356. @classmethod
  357. def handle_response(cls, response):
  358. item = cls()
  359. item['title'] = response.xpath(r'//li[div[text()="所属小区:"]]/div/a/text()').extract_first()
  360. location_list = response.xpath(r'//li[div[text()="所在位置:"]]/div/p//text()').extract()
  361. if location_list:
  362. location = ''.join(location_list).strip()
  363. location = location.replace('\n', '')
  364. location = location.replace('\t', '')
  365. item['location'] = location
  366. item['price'] = response.xpath(r'//li[div[text()="房屋单价:"]]/div[2]/text()').extract_first()
  367. house_type_string: str = response.xpath(r'normalize-space(//li[div[text()="房屋户型:"]]/div[2]/text())').extract_first()
  368. if house_type_string:
  369. item['house_type'] = house_type_string.replace(' ', '')
  370. item['area'] = response.xpath(r'//li[div[text()="建筑面积:"]]/div[2]/text()').extract_first()
  371. item['orientation'] = response.xpath(r'//li[div[text()="房屋朝向:"]]/div[2]/text()').extract_first()
  372. item['floor'] = response.xpath(r'normalize-space(//li[div[text()="所在楼层:"]]/div[2]/text())').extract_first()
  373. item['decoration'] = response.xpath(r'normalize-space(//li[div[text()="装修程度:"]]/div[2]/text())').extract_first()
  374. item['property_type'] = response.xpath(r'normalize-space(//li[div[text()="房屋类型:"]]/div[2]/text())').extract_first()
  375. item['total_price'] = response.xpath(r'//div[@class="wrapper"]/div[@class="wrapper-lf clearfix"]/div[@class="basic-info clearfix"]/span[1]/em/text()').extract_first()
  376. item['down_payment'] = response.xpath(r'normalize-space(//li[div[text()="参考首付:"]]/div[2]/text())').extract_first()
  377. item['monthly_payment'] = response.xpath(r'normalize-space(//li[div[text()="参考月供:"]]/div/span/text())').extract_first()
  378. item['build_year'] = response.xpath(r'normalize-space(//li[div[text()="建造年代:"]]/div[2]/text())').extract_first()
  379. house_code_string = response.xpath(r'//span[contains(text(), "房屋编码")]/text()').extract_first()
  380. if house_code_string:
  381. code_match = re.search(r'[\d]{6,}', house_code_string)
  382. if code_match:
  383. item['house_code'] = code_match.group()
  384. house_date_string = response.xpath(r'//span[contains(text(), "发布时间")]/text()').extract_first()
  385. if house_code_string:
  386. date_match = re.search(r'[\d]{0,4}年[\d]{0,2}月[\d]{0,2}日', house_date_string)
  387. if date_match:
  388. item['publish_date'] = date_match.group()
  389. longitude_match = Selector(response).re(r'lng : "[\d]{0,3}[\.][\d]*"')
  390. if longitude_match:
  391. item['longitude'] = re.search(r'[\d]{0,3}[\.][\d]*', longitude_match[0]).group()
  392. latitude_match = Selector(response).re(r'lat : "[\d]{0,2}[\.][\d]*"')
  393. if latitude_match:
  394. item['latitude'] = re.search(r'[\d]{0,2}[\.][\d]*', latitude_match[0]).group()
  395. if longitude_match and latitude_match:
  396. try:
  397. item['coordinate'] = [float(item['longitude']), float(item['latitude'])]
  398. except Exception as err:
  399. logging.error('type conversion error ! reason: ' + '-'.join(err.args))
  400. item['img_url'] = response.xpath(r'//div[@class="switch_list"][1]/div[@class="img_wrap"][1]/img/@data-src').extract_first()
  401. item['page_url'] = response._url
  402. item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
  403. item['house_id'] = '109'
  404. return item
  405. class LfsAveragePriceItem(scrapy.Item):
  406. _id = scrapy.Field()
  407. title = scrapy.Field()
  408. price = scrapy.Field()
  409. arrow = scrapy.Field()
  410. rate = scrapy.Field()
  411. page_url = scrapy.Field()
  412. date = scrapy.Field()
  413. house_id = scrapy.Field()
  414. @classmethod
  415. def handle_response(cls, response):
  416. item = cls()
  417. item['title'] = response.xpath(r'//div[@class="comm-title"]/a/@title').extract_first()
  418. price_math = Selector(response).re(r'(?<="comm_midprice":")([0-9]*(?=","area_midprice))')
  419. if price_math:
  420. item['price'] = price_math[0]
  421. # item['arrow'] = response.xpath(r'//i[@class="arrow"]/text()').extract_first()
  422. # item['rate'] = response.xpath(r'normalize-space(//span[@class="status level"]/text())').extract_first()
  423. item['page_url'] = response._url
  424. item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
  425. item['house_id'] = '109'
  426. return item