1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950 |
- # -*- coding:utf-8 -*-
- # @Time : 2018/4/27 10:50 AM
- # @Author : Swing
- import scrapy
- from elabSpider.items import ResoldHouseItem
- import logging
- import traceback
- from elabSpider.email_util import send_email
class RentalHouseSpider(scrapy.Spider):
    """Spider for the nb.anjuke.com ``/sale/`` listing pages.

    ``parse`` walks each listing page, schedules one request per house
    detail link and follows the "next page" link; ``parse_item`` turns a
    detail page into a ``ResoldHouseItem``.

    Any failure in a callback is reported by email (with the full
    traceback) and logged, and the crawl carries on.

    NOTE(review): the class is called "RentalHouse" although the spider
    name, start URLs and item type all refer to resold houses; the name is
    kept so existing references to the class keep working.
    """

    name = 'nbresoldHouse'
    allowed_domains = [
        'nb.anjuke.com'
    ]
    start_urls = [
        'https://nb.anjuke.com/sale/o5-t105/',
        'https://nb.anjuke.com/sale/o5-t107/'
        # 'https://nb.anjuke.com/prop/view/A1237992888?from=filter&spread=filtersearch_p&position=117&kwtype=filter&now_time=1526637680'
    ]

    def _report_failure(self, what, response, err):
        """Email the active traceback and log a one-line summary.

        Must be called from inside an ``except`` block, because
        ``traceback.format_exc()`` formats the exception currently being
        handled.

        :param what: short description of the failed step, used in both
            the email subject and the log line.
        :param response: the response whose processing failed.
        :param err: the caught exception.
        """
        send_email('%s %s' % (self.name, what),
                   response.url + '\n' + traceback.format_exc())
        # Exception args are not guaranteed to be strings (e.g. OSError
        # carries an errno int), so stringify each before joining.
        reason = '-'.join(str(arg) for arg in err.args)
        logging.error('%s ! url: %s reason: %s', what, response.url, reason)

    def parse(self, response):
        """Yield requests for every detail link and for the next listing page.

        The two steps are guarded separately so that a failure while
        collecting detail links does not prevent pagination.
        """
        try:
            detail_links = response.xpath(
                '//ul[@id="houselist-mod-new"]/li/div[@class="house-details"]/div[@class="house-title"]/a/@href'
            ).extract()
            for href in detail_links:
                # urljoin leaves absolute URLs untouched and resolves
                # relative ones, which scrapy.Request would reject.
                yield scrapy.Request(response.urljoin(href), callback=self.parse_item)
        except Exception as err:
            self._report_failure('get detail url error', response, err)

        try:
            next_page = response.xpath(
                r'//div[@class="multi-page"]/a[@class="aNxt"]/@href'
            ).extract_first()
            if next_page:
                yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
        except Exception as err:
            self._report_failure('get next page url error', response, err)

    def parse_item(self, response):
        """Yield the item that ``ResoldHouseItem.handle_response`` builds."""
        try:
            item = ResoldHouseItem.handle_response(response)
            yield item
        except Exception as err:
            self._report_failure('parse response error', response, err)
|