# -*- coding: utf-8 -*-
# @Time : 2018/4/27 10:50 AM
# @Author : Swing

import logging
import traceback
from urllib import parse

import scrapy
from scrapy.utils.response import get_base_url

from elabSpider.items import FTXRentalHouseItem
from elabSpider.email_util import send_email


class RentalHouseSpider(scrapy.Spider):
    name = 'ftxrentalHouse'
    allowed_domains = [
        'zu.nb.fang.com'
    ]
    start_urls = [
        'http://zu.nb.fang.com/house/h316-n31/'
    ]

    def parse(self, response):
        # Extract the detail-page links for every listing on the current list page.
        try:
            community_list = response.xpath(
                '//div[@class="houseList"]/dl/dd[@class="info rel"]/p[@class="title"]/a/@href'
            ).extract()
            if community_list:
                for community_url in community_list:
                    if community_url.startswith('/chuzu/'):
                        yield scrapy.Request(
                            parse.urljoin(get_base_url(response), community_url),
                            callback=self.parse_item
                        )
        except Exception as err:
            send_email('ftxrentalHouse lv 1 web parse error',
                       response.url + '\n' + traceback.format_exc())
            logging.error('listing page parse error! url: ' + response.url +
                          ' reason: ' + '-'.join(str(arg) for arg in err.args))

        # Follow the pagination link whose anchor text is "下一页" ("next page").
        try:
            next_page = response.xpath(
                r'//div[@class="fanye"]/a[text()="下一页"]/@href'
            ).extract_first()
            if next_page and next_page.startswith('/house/'):
                yield scrapy.Request(
                    parse.urljoin(get_base_url(response), next_page),
                    callback=self.parse
                )
        except Exception as err:
            send_email('ftxrentalHouse get next page url error',
                       response.url + '\n' + traceback.format_exc())
            logging.error('get next page url error! url: ' + response.url +
                          ' reason: ' + '-'.join(str(arg) for arg in err.args))

    def parse_item(self, response):
        # Delegate field extraction for a single listing page to the item class.
        try:
            item = FTXRentalHouseItem.handle_response(response)
            yield item
        except Exception as err:
            send_email('ftxrentalHouse parse response error',
                       response.url + '\n' + traceback.format_exc())
            logging.error('parse response error! url: ' + response.url +
                          ' reason: ' + '-'.join(str(arg) for arg in err.args))
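
# --- Optional local-run sketch (not part of the original spider). ---
# The usual way to run this spider is `scrapy crawl ftxrentalHouse` from inside
# the elabSpider project. The block below is a minimal convenience for ad-hoc
# debugging and assumes the project settings (pipelines, email config, etc.)
# are importable from the working directory.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(RentalHouseSpider)
    process.start()  # blocks until the crawl finishes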