# nb_ftx_rental_house.py (2.1 KB)
# -*- coding:utf-8 -*-
# @Time : 2018/4/27 10:50 AM
# @Author : Swing
  4. import scrapy
  5. from scrapy.utils.response import get_base_url
  6. from urllib import parse
  7. from elabSpider.items import FTXRentalHouseItem
  8. import logging
  9. import traceback
  10. from elabSpider.email_util import send_email
  11. class RentalHouseSpider(scrapy.Spider):
  12. name = 'ftxrentalHouse'
  13. allowed_domains = [
  14. 'zu.nb.fang.com'
  15. ]
  16. start_urls = [
  17. 'http://zu.nb.fang.com/house/h316-n31/'
  18. ]
  19. def parse(self, response):
  20. try:
  21. community_list = response.xpath('//div[@class="houseList"]/dl/dd[@class="info rel"]/p[@class="title"]/a/@href').extract()
  22. if community_list:
  23. for community_url in community_list:
  24. if community_url.startswith('/chuzu/'):
  25. yield scrapy.Request(parse.urljoin(get_base_url(response), community_url), callback=self.parse_item)
  26. except Exception as err:
  27. send_email('ftxrentalHouse lv 1 web parse error', response._url + '\n' + traceback.format_exc())
  28. logging.error(' error ! url: ' + response._url + " reason: " + '-'.join(err.args))
  29. try:
  30. next_page = response.xpath(r'//div[@class="fanye"]/a[text()="下一页"]/@href').extract_first()
  31. if next_page and next_page.startswith('/house/'):
  32. yield scrapy.Request(parse.urljoin(get_base_url(response), next_page), callback=self.parse)
  33. except Exception as err:
  34. send_email('ftxrentalHouse get next page url error', response._url + '\n' + traceback.format_exc())
  35. logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
  36. def parse_item(self, response):
  37. try:
  38. item = FTXRentalHouseItem.handle_response(response)
  39. yield item
  40. except Exception as err:
  41. send_email('ftxrentalHouse parse response error', response._url + '\n' + traceback.format_exc())
  42. logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))