departmentprice.py

# -*- coding:utf-8 -*-
# @Time : 2018/4/27 10:50 AM
# @Author : Swing
import scrapy
from elabSpider.items import CommunityItem
import traceback
from elabSpider.email_util import send_email


class DepartmentPriceSpider(scrapy.Spider):
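    """Collect community listing cards and the currently selected
    housing-type filter from Anjuke district pages in Shanghai, Hangzhou,
    Suzhou and Ningbo, yielding one CommunityItem per card and following
    pagination links.
    """
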
    name = 'departmentPrice'
    # Must cover every city in start_urls, otherwise Scrapy's offsite
    # middleware drops the follow-up page requests for that city.
    allowed_domains = [
        'shanghai.anjuke.com',
        'hangzhou.anjuke.com',
        'suzhou.anjuke.com',
        'nb.anjuke.com'
    ]
    start_urls = [
        'https://shanghai.anjuke.com/community/xujiahui/',
        'https://shanghai.anjuke.com/community/nanjingxilu/',
        'https://shanghai.anjuke.com/community/jingansi/',
        'https://shanghai.anjuke.com/community/lujiazui/',
        'https://shanghai.anjuke.com/community/nanjingdonglu/',
        'https://shanghai.anjuke.com/community/renminguangchang/',
        'https://shanghai.anjuke.com/community/xintiandia/',
        'https://hangzhou.anjuke.com/community/gulouy/t30/',
        'https://hangzhou.anjuke.com/community/hubin/t30/',
        'https://hangzhou.anjuke.com/community/wushana/t30/',
        'https://hangzhou.anjuke.com/community/wulin/t30/',
        'https://hangzhou.anjuke.com/community/xihuwenhuaguangchang/t30/',
        'https://hangzhou.anjuke.com/community/qianjiangxincheng/t30/',
        'https://suzhou.anjuke.com/community/guanqianjie/t34/',
        'https://suzhou.anjuke.com/community/pingjianglua/t34/',
        'https://suzhou.anjuke.com/community/shilus/t34/',
        'https://suzhou.anjuke.com/community/shishanw/t34/',
        'https://suzhou.anjuke.com/community/hudongs/t34/',
        'https://suzhou.anjuke.com/community/huxi/t34/',
        'https://nb.anjuke.com/community/tianyiguangchang/t107/',
        'https://nb.anjuke.com/community/gulouh/t107/',
        'https://nb.anjuke.com/community/dongbuxinchengw/t107/',
        'https://nb.anjuke.com/community/baizhangt/t107/',
        'https://nb.anjuke.com/community/zhongma/t107/',
        'https://hangzhou.anjuke.com/community/gulouy/t29/',
        'https://hangzhou.anjuke.com/community/hubin/t29/',
        'https://hangzhou.anjuke.com/community/wushana/t29/',
        'https://hangzhou.anjuke.com/community/wulin/t29/',
        'https://hangzhou.anjuke.com/community/xihuwenhuaguangchang/t29/',
        'https://hangzhou.anjuke.com/community/qianjiangxincheng/t29/',
        'https://suzhou.anjuke.com/community/guanqianjie/t33/',
        'https://suzhou.anjuke.com/community/pingjianglua/t33/',
        'https://suzhou.anjuke.com/community/shilus/t33/',
        'https://suzhou.anjuke.com/community/shishanw/t33/',
        'https://suzhou.anjuke.com/community/hudongs/t33/',
        'https://suzhou.anjuke.com/community/huxi/t33/',
        'https://nb.anjuke.com/community/tianyiguangchang/t105/',
        'https://nb.anjuke.com/community/gulouh/t105/',
        'https://nb.anjuke.com/community/dongbuxinchengw/t105/',
        'https://nb.anjuke.com/community/baizhangt/t105/',
        'https://nb.anjuke.com/community/zhongma/t105/'
    ]
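
    # Note: the trailing tNN path segments (t30/t29, t34/t33, t107/t105) are
    # Anjuke list filters with city-specific codes; judging by the house_type
    # extraction in parse() below, each pair presumably selects two housing
    # types, so every district is crawled once per filter. This reading is an
    # inference from the URLs, not confirmed by any Anjuke documentation.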
    def parse(self, response):
        try:
            # Each community card on an Anjuke listing page is a div carrying
            # the tracking attribute _soj="xqlb".
            community_list = response.xpath('//div[@class="maincontent"]/div[@class="list-content"]/div[@_soj="xqlb"]').extract()
            # The selected housing-type filter sits in slightly different
            # markup depending on the page variant, hence the two fallbacks.
            house_type = response.xpath('//div[@class="items"][3]/span[@class="elems-l pp-mod"]/a[@class="selected-item"]/text()').extract_first()
            if not house_type:
                house_type = response.xpath('//div[@class="items no-border-bottom"]/span[@class="elems-l "]/a[@class="selected-item"]/text()').extract_first()
            if not house_type:
                house_type = response.xpath('//div[@class="items"][3]/span[@class="elems-l "]/a[@class="selected-item"]/text()').extract_first()
            if community_list:
                for community in community_list:
                    item = CommunityItem.handle_response(community, house_type)
                    yield item
        except Exception:
            # Report the failing URL along with the stack trace.
            send_email('departmentPrice lv 1 web parse error', response.url + '\n' + traceback.format_exc())
            print('error info:', response.url)
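
        # Pagination: the "aNxt" anchor on Anjuke list pages holds the URL of
        # the next results page; follow it with this same callback until no
        # such link remains.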
        try:
            next_page = response.xpath('//div[@class="page-content"]/div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first()
            if next_page:
                yield scrapy.Request(next_page, callback=self.parse)
        except Exception:
            send_email('departmentPrice get next page url error', response.url + '\n' + traceback.format_exc())
            print('error info:', response.url)
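
# Usage (standard Scrapy CLI, run from the elabSpider project root):
#   scrapy crawl departmentPrice -o communities.json
# This assumes CommunityItem.handle_response() and email_util.send_email()
# are defined elsewhere in the elabSpider project, as the imports above
# suggest; neither is shown in this file.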