sjk_rental_house.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. # -*- coding:utf-8 -*-
  2. # @Time : 2018/4/27 10:50 AM
  3. # @Author : Swing
  4. import scrapy
  5. from elabSpider.items import RentalHouseItem
  6. import logging
  7. import traceback
  8. from elabSpider.email_util import send_email
  9. class RentalHouseSpider(scrapy.Spider):
  10. name = 'sjkrentalHouse'
  11. allowed_domains = [
  12. 'nb.zu.anjuke.com',
  13. 'nb.anjuke.com'
  14. ]
  15. start_urls = [
  16. 'https://nb.anjuke.com/community/props/rent/275642/lx1-x1/',
  17. 'https://nb.anjuke.com/community/props/rent/275642/lx8-x1/',
  18. 'https://nb.anjuke.com/community/props/rent/1003094/lx1-x1/',
  19. 'https://nb.anjuke.com/community/props/rent/1003094/lx8-x1/',
  20. 'https://nb.anjuke.com/community/props/rent/275869/lx1-x1/',
  21. 'https://nb.anjuke.com/community/props/rent/275869/lx8-x1/',
  22. 'https://nb.anjuke.com/community/props/rent/973807/lx1-x1/',
  23. 'https://nb.anjuke.com/community/props/rent/973807/lx8-x1/',
  24. 'https://nb.anjuke.com/community/props/rent/973808/lx1-x1/',
  25. 'https://nb.anjuke.com/community/props/rent/973808/lx8-x1/',
  26. 'https://nb.anjuke.com/community/props/rent/275517/lx1-x1/',
  27. 'https://nb.anjuke.com/community/props/rent/275517/lx8-x1/',
  28. 'https://nb.anjuke.com/community/props/rent/1000067/lx1-x1/',
  29. 'https://nb.anjuke.com/community/props/rent/1000067/lx8-x1/',
  30. 'https://nb.anjuke.com/community/props/rent/406899/lx1-x1/',
  31. 'https://nb.anjuke.com/community/props/rent/406899/lx8-x1/',
  32. 'https://nb.anjuke.com/community/props/rent/1016525/lx1-x1/',
  33. 'https://nb.anjuke.com/community/props/rent/1016525/lx8-x1/',
  34. 'https://nb.anjuke.com/community/props/rent/275936/lx1-x1/',
  35. 'https://nb.anjuke.com/community/props/rent/275936/lx8-x1/',
  36. 'https://nb.anjuke.com/community/props/rent/1017728/lx1-x1/',
  37. 'https://nb.anjuke.com/community/props/rent/1017728/lx8-x1/',
  38. 'https://nb.anjuke.com/community/props/rent/275274/lx1-x1/',
  39. 'https://nb.anjuke.com/community/props/rent/275274/lx8-x1/',
  40. 'https://nb.anjuke.com/community/props/rent/275658/lx1-x1/',
  41. 'https://nb.anjuke.com/community/props/rent/275658/lx8-x1/',
  42. 'https://nb.anjuke.com/community/props/rent/275386/lx1-x1/',
  43. 'https://nb.anjuke.com/community/props/rent/275386/lx8-x1/',
  44. 'https://nb.anjuke.com/community/props/rent/1006982/lx1-x1/',
  45. 'https://nb.anjuke.com/community/props/rent/1006982/lx8-x1/',
  46. 'https://nb.anjuke.com/community/props/rent/275764/lx1-x1/',
  47. 'https://nb.anjuke.com/community/props/rent/275764/lx8-x1/',
  48. 'https://nb.anjuke.com/community/props/rent/792725/lx1-x1/',
  49. 'https://nb.anjuke.com/community/props/rent/792725/lx8-x1/',
  50. 'https://nb.anjuke.com/community/props/rent/1022250/lx1-x1/',
  51. 'https://nb.anjuke.com/community/props/rent/1022250/lx8-x1/'
  52. ]
  53. def parse(self, response):
  54. try:
  55. community_list = response.xpath('//ul[@class="m-house-list"]/li/a/@href').extract()
  56. if community_list:
  57. for community_url in community_list:
  58. yield scrapy.Request(community_url, callback=self.parse_item)
  59. except Exception as err:
  60. send_email('sjkrentalHouse get detail url error', response._url + '\n' + traceback.format_exc())
  61. logging.error('get detail url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
  62. try:
  63. next_page = response.xpath(r'//div[@class="m-page"]/div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first()
  64. if next_page:
  65. yield scrapy.Request(next_page, callback=self.parse)
  66. except Exception as err:
  67. send_email('sjkrentalHouse get next page url parse error', response._url + '\n' + traceback.format_exc())
  68. logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
  69. def parse_item(self, response):
  70. try:
  71. item = RentalHouseItem.handle_response(response)
  72. yield item
  73. except Exception as err:
  74. send_email('sjkrentalHouse parse response error', response._url + '\n' + traceback.format_exc())
  75. logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))