nb_rental_house.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. # -*- coding:utf-8 -*-
  2. # @Time : 2018/4/27 10:50 AM
  3. # @Author : Swing
  4. import scrapy
  5. from elabSpider.items import RentalHouseItem
  6. import logging
  7. import traceback
  8. from elabSpider.email_util import send_email
  9. class RentalHouseSpider(scrapy.Spider):
  10. name = 'rentalHouse'
  11. allowed_domains = [
  12. 'nb.zu.anjuke.com'
  13. ]
  14. start_urls = [
  15. 'https://nb.zu.anjuke.com/fangyuan/lx8-px3-x1/',
  16. 'https://nb.zu.anjuke.com/fangyuan/lx1-px3-x1/'
  17. ]
  18. def parse(self, response):
  19. try:
  20. community_list = response.xpath('//div[@class="maincontent"]/div[@class="list-content"]/div[contains(@class, "zu-itemmod")]/a/@href').extract()
  21. if community_list:
  22. for community_url in community_list:
  23. yield scrapy.Request(community_url, callback=self.parse_item)
  24. except Exception as err:
  25. send_email('rentalHouse get detail url error', response._url + '\n' + traceback.format_exc())
  26. logging.error('get detail url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
  27. try:
  28. next_page = response.xpath(r'//div[@class="page-content"]/div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first()
  29. if next_page:
  30. yield scrapy.Request(next_page, callback=self.parse)
  31. except Exception as err:
  32. send_email('rentalHouse get next page url error', response._url + '\n' + traceback.format_exc())
  33. logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
  34. def parse_item(self, response):
  35. try:
  36. item = RentalHouseItem.handle_response(response)
  37. yield item
  38. except Exception as err:
  39. send_email('rentalHouse parse response error', response._url + '\n' + traceback.format_exc())
  40. logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))