# -*- coding: utf-8 -*-
import logging
import traceback
from urllib import parse

import scrapy
from scrapy.utils.response import get_base_url

from elabSpider.items import FTXCommunityItem
from elabSpider.email_util import send_email


class ExampleSpider(scrapy.Spider):
    """Crawl community (小区) pages from fang.com city sub-sites."""

    name = 'fangtianxia'
    allowed_domains = ['fang.com']
    start_urls = [
        # Shanghai
        'http://esf.sh.fang.com/housing/25_1633_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/19_103_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/21_1622_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/21_1623_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/24_5240_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/24_5239_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/22_1625_1_0_0_0_1_0_0_0/',
        # Hangzhou
        'http://esf.hz.fang.com/housing/149__1_0_0_0_1_0_0_0/',
        'http://esf.hz.fang.com/housing/150__1_0_0_0_1_0_0_0/',
        'http://esf.hz.fang.com/housing/153__1_0_0_0_1_0_0_0/',
        # Suzhou
        'http://esf.suzhou.fang.com/housing/13102__1_0_0_0_1_0_0_0/',
        'http://esf.suzhou.fang.com/housing/278_4008_1_0_0_0_1_0_0_0/',
        'http://esf.suzhou.fang.com/housing/277__1_0_0_0_1_0_0_0/',
        # Ningbo
        'http://esf.nb.fang.com/housing/162_4220_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/162_13968_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/1047_13973_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/1047_17420_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/164__1_0_0_0_1_0_0_0/',
    ]
    # Single community page, handy for debugging:
    # start_urls = ['http://huanqiuguangchang2.fang.com/xiangqing/']

    def parse(self, response):
        # Level 1: follow every community link on the listing page.
        try:
            for href in response.xpath(r'//a[@class="plotTit"]/@href'):
                url = href.extract()  # type: str
                if not url.startswith('http'):
                    url = parse.urljoin(get_base_url(response), url)
                yield scrapy.Request(url, callback=self.parse_subweb, dont_filter=True)
        except Exception as err:
            send_email('fangtianxia lv 1 web parse error',
                       response.url + '\n' + traceback.format_exc())
            logging.error('lv 1 web parse error url: %s %s', response.url, err)

        # Pagination: queue the next listing page if one exists.
        try:
            next_page = response.xpath(
                r'//div[@class="fanye gray6"]/a[@id="PageControl1_hlk_next"]/@href'
            ).extract_first()
            if next_page:
                full_url = parse.urljoin(get_base_url(response), next_page)
                yield scrapy.Request(full_url, callback=self.parse, dont_filter=True)
        except Exception as err:
            send_email('fangtianxia next page url parse error',
                       response.url + '\n' + traceback.format_exc())
            logging.error('next page url parse error url: %s %s', response.url, err)

    def parse_subweb(self, response):
        # Level 2: from the community overview page, follow the detail (小区详情) tab.
        try:
            url = response.xpath(r'//li[@data="xqxq"]/a/@href').extract_first()
            if url:
                yield scrapy.Request(url, callback=self.parse_item, dont_filter=True)
        except Exception as err:
            send_email('fangtianxia get detail url error',
                       response.url + '\n' + traceback.format_exc())
            logging.error('get detail url error url: %s %s', response.url, err)

    def parse_item(self, response):
        # Level 3: build the community item from the detail page.
        try:
            yield FTXCommunityItem.handle_response(response)
        except Exception as err:
            send_email('fangtianxia lv 2 web parse error',
                       response.url + '\n' + traceback.format_exc())
            logging.error('lv 2 web parse error url: %s %s', response.url, err)
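

# Minimal local runner (a sketch, not part of the original module). It assumes
# this file sits inside the elabSpider Scrapy project so get_project_settings()
# can locate the project configuration; in normal use the spider is launched
# with `scrapy crawl fangtianxia` from the project root instead.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(ExampleSpider)
    process.start()  # blocks until the crawl finishes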