sjk_resold.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. # -*- coding:utf-8 -*-
  2. # @Time : 2018/4/27 10:50 AM
  3. # @Author : Swing
  4. import scrapy
  5. from elabSpider.items import ResoldHouseItem
  6. import logging
  7. import traceback
  8. from elabSpider.email_util import send_email
  9. class RentalHouseSpider(scrapy.Spider):
  10. name = 'sjkresoldHouse'
  11. allowed_domains = [
  12. 'nb.anjuke.com'
  13. ]
  14. start_urls = [
  15. 'https://nb.anjuke.com/community/props/sale/275642/',
  16. 'https://nb.anjuke.com/community/props/sale/1003094/',
  17. 'https://nb.anjuke.com/community/props/sale/275869/',
  18. 'https://nb.anjuke.com/community/props/sale/973807/',
  19. 'https://nb.anjuke.com/community/props/sale/973808/',
  20. 'https://nb.anjuke.com/community/props/sale/275517/',
  21. 'https://nb.anjuke.com/community/props/sale/1000067/',
  22. 'https://nb.anjuke.com/community/props/sale/406899/',
  23. 'https://nb.anjuke.com/community/props/sale/1016525/',
  24. 'https://nb.anjuke.com/community/props/sale/275936/',
  25. 'https://nb.anjuke.com/community/props/sale/1017728/',
  26. 'https://nb.anjuke.com/community/props/sale/275274/',
  27. 'https://nb.anjuke.com/community/props/sale/275658/',
  28. 'https://nb.anjuke.com/community/props/sale/275386/',
  29. 'https://nb.anjuke.com/community/props/sale/1006982/',
  30. 'https://nb.anjuke.com/community/props/sale/275764/',
  31. 'https://nb.anjuke.com/community/props/sale/792725/',
  32. 'https://nb.anjuke.com/community/props/sale/1022250/'
  33. # 'https://nb.anjuke.com/prop/view/A1237992888?from=filter&spread=filtersearch_p&position=117&kwtype=filter&now_time=1526637680'
  34. ]
  35. def parse(self, response):
  36. try:
  37. community_list = response.xpath('//ul[@class="m-house-list"]/li/a/@href').extract()
  38. if community_list:
  39. for community_url in community_list:
  40. yield scrapy.Request(community_url, callback=self.parse_item)
  41. except Exception as err:
  42. send_email('sjkresoldHouse get detail url error', response._url + '\n' + traceback.format_exc())
  43. logging.error('get detail url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
  44. try:
  45. next_page = response.xpath(r'//div[@class="m-page"]/div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first()
  46. if next_page:
  47. yield scrapy.Request(next_page, callback=self.parse)
  48. except Exception as err:
  49. send_email('sjkresoldHouse get next page url error', response._url + '\n' + traceback.format_exc())
  50. logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
  51. def parse_item(self, response):
  52. try:
  53. item = ResoldHouseItem.handle_response(response)
  54. yield item
  55. except Exception as err:
  56. send_email('sjkresoldHouse parse response error', response._url + '\n' + traceback.format_exc())
  57. logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))