elab-papa
/
spider


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
							# -*- coding:utf-8 -*-

# @Time    : 2018/4/27 10:50 AM

# @Author  : Swing

import scrapy
from scrapy.utils.response  import get_base_url
from urllib import parse
from elabSpider.items import FTXRentalHouseItem
import logging
import traceback
from elabSpider.email_util import send_email


class RentalHouseSpider(scrapy.Spider):
    name = 'ftxrentalHouse'
    allowed_domains = [
        'zu.nb.fang.com'
    ]
    start_urls = [
        'http://zu.nb.fang.com/house/h316-n31/'
    ]

    def parse(self, response):
        try:
            community_list = response.xpath('//div[@class="houseList"]/dl/dd[@class="info rel"]/p[@class="title"]/a/@href').extract()

            if community_list:
                for community_url in community_list:
                    if community_url.startswith('/chuzu/'):
                        yield scrapy.Request(parse.urljoin(get_base_url(response), community_url), callback=self.parse_item)
        except Exception as err:
            send_email('ftxrentalHouse lv 1 web parse error', response._url + '\n' + traceback.format_exc())
            logging.error(' error ! url: ' + response._url + " reason: " + '-'.join(err.args))

        try:
            next_page = response.xpath(r'//div[@class="fanye"]/a[text()="下一页"]/@href').extract_first()
            if next_page and next_page.startswith('/house/'):
                yield scrapy.Request(parse.urljoin(get_base_url(response), next_page), callback=self.parse)
        except Exception as err:
            send_email('ftxrentalHouse get next page url error', response._url + '\n' + traceback.format_exc())
            logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))

    def parse_item(self, response):
        try:
            item = FTXRentalHouseItem.handle_response(response)
            yield item
        except Exception as err:
            send_email('ftxrentalHouse parse response error', response._url + '\n' + traceback.format_exc())
            logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))