123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- # -*- coding:utf-8 -*-
- # @Time : 2018/4/27 10:50 AM
- # @Author : Swing
- import scrapy
- from elabSpider.items import CommunityItem
- import traceback
- from elabSpider.email_util import send_email
class DepartmentPriceSpider(scrapy.Spider):
    """Spider that walks Anjuke community listing pages for several cities.

    For every listing page it extracts the raw HTML of each community entry,
    determines which house-type filter is currently selected on the page, and
    hands both to ``CommunityItem.handle_response`` to build the item that is
    yielded. It then follows the "next page" link, if any, with the same
    callback.

    Failures while parsing a page or while locating the next-page link are
    reported through ``send_email`` (with the page URL and traceback) instead
    of aborting the crawl.
    """

    name = 'departmentPrice'

    # Every host that appears in ``start_urls`` must be listed here; otherwise
    # the follow-up (next page) requests for that host are dropped as offsite.
    allowed_domains = [
        'shanghai.anjuke.com',
        'hangzhou.anjuke.com',
        'suzhou.anjuke.com',
        'nb.anjuke.com',
    ]

    start_urls = [
        'https://shanghai.anjuke.com/community/xujiahui/',
        'https://shanghai.anjuke.com/community/nanjingxilu/',
        'https://shanghai.anjuke.com/community/jingansi/',
        'https://shanghai.anjuke.com/community/lujiazui/',
        'https://shanghai.anjuke.com/community/nanjingdonglu/',
        'https://shanghai.anjuke.com/community/renminguangchang/',
        'https://shanghai.anjuke.com/community/xintiandia/',
        'https://hangzhou.anjuke.com/community/gulouy/t30/',
        'https://hangzhou.anjuke.com/community/hubin/t30/',
        'https://hangzhou.anjuke.com/community/wushana/t30/',
        'https://hangzhou.anjuke.com/community/wulin/t30/',
        'https://hangzhou.anjuke.com/community/xihuwenhuaguangchang/t30/',
        'https://hangzhou.anjuke.com/community/qianjiangxincheng/t30/',
        'https://suzhou.anjuke.com/community/guanqianjie/t34/',
        'https://suzhou.anjuke.com/community/pingjianglua/t34/',
        'https://suzhou.anjuke.com/community/shilus/t34/',
        'https://suzhou.anjuke.com/community/shishanw/t34/',
        'https://suzhou.anjuke.com/community/hudongs/t34/',
        'https://suzhou.anjuke.com/community/huxi/t34/',
        'https://nb.anjuke.com/community/tianyiguangchang/t107/',
        'https://nb.anjuke.com/community/gulouh/t107/',
        'https://nb.anjuke.com/community/dongbuxinchengw/t107/',
        'https://nb.anjuke.com/community/baizhangt/t107/',
        'https://nb.anjuke.com/community/zhongma/t107/',
        'https://hangzhou.anjuke.com/community/gulouy/t29/',
        'https://hangzhou.anjuke.com/community/hubin/t29/',
        'https://hangzhou.anjuke.com/community/wushana/t29/',
        'https://hangzhou.anjuke.com/community/wulin/t29/',
        'https://hangzhou.anjuke.com/community/xihuwenhuaguangchang/t29/',
        'https://hangzhou.anjuke.com/community/qianjiangxincheng/t29/',
        'https://suzhou.anjuke.com/community/guanqianjie/t33/',
        'https://suzhou.anjuke.com/community/pingjianglua/t33/',
        'https://suzhou.anjuke.com/community/shilus/t33/',
        'https://suzhou.anjuke.com/community/shishanw/t33/',
        'https://suzhou.anjuke.com/community/hudongs/t33/',
        'https://suzhou.anjuke.com/community/huxi/t33/',
        'https://nb.anjuke.com/community/tianyiguangchang/t105/',
        'https://nb.anjuke.com/community/gulouh/t105/',
        'https://nb.anjuke.com/community/dongbuxinchengw/t105/',
        'https://nb.anjuke.com/community/baizhangt/t105/',
        'https://nb.anjuke.com/community/zhongma/t105/',
    ]

    # XPath candidates for the text of the selected house-type filter, tried
    # in this order; the page markup differs between listing pages.
    _HOUSE_TYPE_XPATHS = (
        '//div[@class="items"][3]/span[@class="elems-l pp-mod"]/a[@class="selected-item"]/text()',
        '//div[@class="items no-border-bottom"]/span[@class="elems-l "]/a[@class="selected-item"]/text()',
        '//div[@class="items"][3]/span[@class="elems-l "]/a[@class="selected-item"]/text()',
    )

    def _extract_house_type(self, response):
        """Return the selected house-type label, or the last lookup's result if none matched."""
        house_type = None
        for xpath in self._HOUSE_TYPE_XPATHS:
            house_type = response.xpath(xpath).extract_first()
            if house_type:
                break
        return house_type

    def parse(self, response):
        """Yield one item per community on the page, then a request for the next page.

        :param response: the Scrapy response for a community listing page.
        :returns: a generator of items built by ``CommunityItem.handle_response``
            followed by at most one ``scrapy.Request`` for the next listing page.
        """
        # ``except Exception`` (not a bare ``except``) so that GeneratorExit,
        # raised at a ``yield`` when the generator is closed, is not swallowed.
        try:
            community_list = response.xpath(
                '//div[@class="maincontent"]/div[@class="list-content"]/div[@_soj="xqlb"]'
            ).extract()
            house_type = self._extract_house_type(response)
            for community in community_list:
                yield CommunityItem.handle_response(community, house_type)
        except Exception:
            send_email(
                'departmentPrice lv 1 web parse error',
                response.url + '\n' + traceback.format_exc(),
            )
            print('error info: ', response.url)

        try:
            next_page = response.xpath(
                r'//div[@class="page-content"]/div[@class="multi-page"]/a[@class="aNxt"]/@href'
            ).extract_first()
            if next_page:
                # urljoin leaves absolute links untouched and resolves relative ones.
                yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
        except Exception:
            send_email(
                'departmentPrice get next page url error',
                response.url + '\n' + traceback.format_exc(),
            )
            print('error info: ', response.url)
|