# -*- coding: utf-8 -*-
import logging
import traceback
from urllib import parse

import scrapy
from scrapy.utils.response import get_base_url

from elabSpider.items import FTXCommunityItem
from elabSpider.email_util import send_email


class ExampleSpider(scrapy.Spider):
    name = 'fangtianxia'
    allowed_domains = ['fang.com']
    start_urls = [
        'http://esf.sh.fang.com/housing/25_1633_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/19_103_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/21_1622_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/21_1623_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/24_5240_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/24_5239_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/22_1625_1_0_0_0_1_0_0_0/',
        'http://esf.hz.fang.com/housing/149__1_0_0_0_1_0_0_0/',
        'http://esf.hz.fang.com/housing/150__1_0_0_0_1_0_0_0/',
        'http://esf.hz.fang.com/housing/153__1_0_0_0_1_0_0_0/',
        'http://esf.suzhou.fang.com/housing/13102__1_0_0_0_1_0_0_0/',
        'http://esf.suzhou.fang.com/housing/278_4008_1_0_0_0_1_0_0_0/',
        'http://esf.suzhou.fang.com/housing/277__1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/162_4220_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/162_13968_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/1047_13973_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/1047_17420_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/164__1_0_0_0_1_0_0_0/',
    ]
    # start_urls = ['http://huanqiuguangchang2.fang.com/xiangqing/']
    # rules = (
    #     Rule(link_extractor=r'http://[.]+(\.)fang.com/xiangqing', callback='parse_item')
    # )

    def parse(self, response):
        # item = FTXCommunityItem.handle_response(response)
        # yield item

        # Follow every community link on the listing page.
        try:
            for href in response.xpath(r'//a[@class="plotTit"]/@href'):
                url = href.extract()  # type: str
                if not url.startswith('http'):
                    url = parse.urljoin(get_base_url(response), url)
                yield scrapy.Request(url, callback=self.parse_subweb, dont_filter=True)
        except Exception as err:
            send_email('fangtianxia lv 1 web parse error', response.url + '\n' + traceback.format_exc())
            msg = 'lv 1 web parse error url: ' + response.url + ' ' + '-'.join(err.args)
            logging.error(msg=msg)

        # Queue the next listing page, if there is one.
        try:
            next_page = response.xpath(r'//div[@class="fanye gray6"]/a[@id="PageControl1_hlk_next"]/@href').extract_first()
            if next_page:
                base_url = get_base_url(response)
                full_url = parse.urljoin(base_url, next_page)
                yield scrapy.Request(full_url, callback=self.parse, dont_filter=True)
        except Exception as err:
            send_email('fangtianxia next page url parse error', response.url + '\n' + traceback.format_exc())
            msg = 'next page url parse error url: ' + response.url + ' ' + '-'.join(err.args)
            logging.error(msg=msg)
    def parse_subweb(self, response):
        # Jump from the community overview page to its detail page.
        try:
            url = response.xpath(r'//li[@data="xqxq"]/a/@href').extract_first()
            yield scrapy.Request(url, callback=self.parse_item, dont_filter=True)
        except Exception as err:
            send_email('fangtianxia get detail url error', response.url + '\n' + traceback.format_exc())
            msg = 'get detail url error url: ' + response.url + ' ' + '-'.join(err.args)
            logging.error(msg=msg)
    def parse_item(self, response):
        # Build an FTXCommunityItem from the community detail page.
        try:
            item = FTXCommunityItem.handle_response(response)
            yield item
        except Exception as err:
            send_email('fangtianxia lv 2 web parse error', response.url + '\n' + traceback.format_exc())
            msg = 'lv 2 web parse error url: ' + response.url + ' ' + '-'.join(err.args)
            logging.error(msg=msg)