# -*- coding:utf-8 -*-
# @Time : 2018/4/27 10:50 AM
# @Author : Swing
import traceback

import scrapy

from elabSpider.items import CommunityItem
from elabSpider.email_util import send_email


class DepartmentPriceSpider(scrapy.Spider):
    """Crawl Anjuke community listing pages and emit one item per community.

    Every listing page is handled by :meth:`parse`, which builds items through
    ``CommunityItem.handle_response`` and then follows the "next page" link,
    if any, back into :meth:`parse`.
    """

    name = 'departmentPrice'

    # Must cover every host used in ``start_urls``; otherwise the follow-up
    # pagination requests for that host are dropped by the offsite filter.
    # NOTE(review): 'shanghai.anjuke.com' was missing and the Suzhou entry was
    # misspelled ('suzhou.zuanjuke.com') -- confirm that paginating these two
    # cities is intended.
    allowed_domains = [
        'shanghai.anjuke.com',
        'hangzhou.anjuke.com',
        'suzhou.anjuke.com',
        'nb.anjuke.com',
    ]

    start_urls = [
        'https://shanghai.anjuke.com/community/xujiahui/',
        'https://shanghai.anjuke.com/community/nanjingxilu/',
        'https://shanghai.anjuke.com/community/jingansi/',
        'https://shanghai.anjuke.com/community/lujiazui/',
        'https://shanghai.anjuke.com/community/nanjingdonglu/',
        'https://shanghai.anjuke.com/community/renminguangchang/',
        'https://shanghai.anjuke.com/community/xintiandia/',

        'https://hangzhou.anjuke.com/community/gulouy/t30/',
        'https://hangzhou.anjuke.com/community/hubin/t30/',
        'https://hangzhou.anjuke.com/community/wushana/t30/',
        'https://hangzhou.anjuke.com/community/wulin/t30/',
        'https://hangzhou.anjuke.com/community/xihuwenhuaguangchang/t30/',
        'https://hangzhou.anjuke.com/community/qianjiangxincheng/t30/',

        'https://suzhou.anjuke.com/community/guanqianjie/t34/',
        'https://suzhou.anjuke.com/community/pingjianglua/t34/',
        'https://suzhou.anjuke.com/community/shilus/t34/',
        'https://suzhou.anjuke.com/community/shishanw/t34/',
        'https://suzhou.anjuke.com/community/hudongs/t34/',
        'https://suzhou.anjuke.com/community/huxi/t34/',

        'https://nb.anjuke.com/community/tianyiguangchang/t107/',
        'https://nb.anjuke.com/community/gulouh/t107/',
        'https://nb.anjuke.com/community/dongbuxinchengw/t107/',
        'https://nb.anjuke.com/community/baizhangt/t107/',
        'https://nb.anjuke.com/community/zhongma/t107/',

        'https://hangzhou.anjuke.com/community/gulouy/t29/',
        'https://hangzhou.anjuke.com/community/hubin/t29/',
        'https://hangzhou.anjuke.com/community/wushana/t29/',
        'https://hangzhou.anjuke.com/community/wulin/t29/',
        'https://hangzhou.anjuke.com/community/xihuwenhuaguangchang/t29/',
        'https://hangzhou.anjuke.com/community/qianjiangxincheng/t29/',

        'https://suzhou.anjuke.com/community/guanqianjie/t33/',
        'https://suzhou.anjuke.com/community/pingjianglua/t33/',
        'https://suzhou.anjuke.com/community/shilus/t33/',
        'https://suzhou.anjuke.com/community/shishanw/t33/',
        'https://suzhou.anjuke.com/community/hudongs/t33/',
        'https://suzhou.anjuke.com/community/huxi/t33/',

        'https://nb.anjuke.com/community/tianyiguangchang/t105/',
        'https://nb.anjuke.com/community/gulouh/t105/',
        'https://nb.anjuke.com/community/dongbuxinchengw/t105/',
        'https://nb.anjuke.com/community/baizhangt/t105/',
        'https://nb.anjuke.com/community/zhongma/t105/',
    ]

    # XPath queries for the text of the currently selected filter link, tried
    # in order until one yields a non-empty value.
    _HOUSE_TYPE_XPATHS = (
        '//div[@class="items"][3]/span[@class="elems-l pp-mod"]/a[@class="selected-item"]/text()',
        '//div[@class="items no-border-bottom"]/span[@class="elems-l "]/a[@class="selected-item"]/text()',
        '//div[@class="items"][3]/span[@class="elems-l "]/a[@class="selected-item"]/text()',
    )

    def _extract_house_type(self, response):
        """Return the selected filter text used as the house type.

        The queries in ``_HOUSE_TYPE_XPATHS`` are tried in order. The first
        non-empty result wins; if none matches, the (empty or ``None``) result
        of the last query is returned.
        """
        house_type = None
        for query in self._HOUSE_TYPE_XPATHS:
            house_type = response.xpath(query).extract_first()
            if house_type:
                break
        return house_type

    def parse(self, response):
        """Parse one community listing page.

        Yields whatever ``CommunityItem.handle_response`` returns for each
        community entry on the page, followed by a ``scrapy.Request`` for the
        next listing page when a "next" link is present.

        Failures in either stage are reported through ``send_email`` and
        printed; they do not abort the other stage.
        """
        # ``except Exception`` (not a bare ``except``) so that GeneratorExit
        # and KeyboardInterrupt still propagate out of this generator.
        try:
            community_list = response.xpath(
                '//div[@class="maincontent"]/div[@class="list-content"]/div[@_soj="xqlb"]'
            ).extract()
            house_type = self._extract_house_type(response)

            for community in community_list:
                yield CommunityItem.handle_response(community, house_type)
        except Exception:
            send_email('departmentPrice lv 1 web parse error',
                       response.url + '\n' + traceback.format_exc())
            print('error info: ', response.url)

        try:
            next_page = response.xpath(
                r'//div[@class="page-content"]/div[@class="multi-page"]/a[@class="aNxt"]/@href'
            ).extract_first()
            if next_page:
                # urljoin leaves absolute URLs untouched and resolves relative
                # hrefs, which scrapy.Request would otherwise reject.
                yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
        except Exception:
            send_email('departmentPrice get next page url error',
                       response.url + '\n' + traceback.format_exc())
            print('error info: ', response.url)