fangtianxiacommunity.py

# -*- coding: utf-8 -*-
from elabSpider.items import *
from scrapy.utils.response import get_base_url
from urllib import parse
import logging
import scrapy
from elabSpider.email_util import send_email
import traceback

class ExampleSpider(scrapy.Spider):
    """Crawl fang.com second-hand housing listings and scrape each community's detail page."""

    name = 'fangtianxia'
    allowed_domains = ['fang.com']  # bare domain name, no trailing slash
    start_urls = [
        'http://esf.sh.fang.com/housing/25_1633_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/19_103_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/21_1622_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/21_1623_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/24_5240_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/24_5239_1_0_0_0_1_0_0_0/',
        'http://esf.sh.fang.com/housing/22_1625_1_0_0_0_1_0_0_0/',
        'http://esf.hz.fang.com/housing/149__1_0_0_0_1_0_0_0/',
        'http://esf.hz.fang.com/housing/150__1_0_0_0_1_0_0_0/',
        'http://esf.hz.fang.com/housing/153__1_0_0_0_1_0_0_0/',
        'http://esf.suzhou.fang.com/housing/13102__1_0_0_0_1_0_0_0/',
        'http://esf.suzhou.fang.com/housing/278_4008_1_0_0_0_1_0_0_0/',
        'http://esf.suzhou.fang.com/housing/277__1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/162_4220_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/162_13968_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/1047_13973_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/1047_17420_1_0_0_0_1_0_0_0/',
        'http://esf.nb.fang.com/housing/164__1_0_0_0_1_0_0_0/'
    ]
    # start_urls = ['http://huanqiuguangchang2.fang.com/xiangqing/']
    # rules = (
    #     Rule(link_extractor=r'http://[.]+(\.)fang.com/xiangqing', callback='parse_item')
    # )

    def parse(self, response):
        # Level 1: district listing page -- follow every community link ("plotTit").
        try:
            for href in response.xpath(r'//a[@class="plotTit"]/@href'):
                url = href.extract()  # type: str
                if not url.startswith('http'):
                    url = parse.urljoin(get_base_url(response), url)
                yield scrapy.Request(url, callback=self.parse_subweb, dont_filter=True)
        except Exception as err:
            send_email('fangtianxia lv 1 web parse error', response.url + '\n' + traceback.format_exc())
            msg = 'lv 1 web parse error url: ' + response.url + ' ' + '-'.join(str(arg) for arg in err.args)
            logging.error(msg=msg)

        # Follow the pagination link and re-enter parse() for the next listing page.
        try:
            next_page = response.xpath(r'//div[@class="fanye gray6"]/a[@id="PageControl1_hlk_next"]/@href').extract_first()
            if next_page:
                base_url = get_base_url(response)
                full_url = parse.urljoin(base_url, next_page)
                yield scrapy.Request(full_url, callback=self.parse, dont_filter=True)
        except Exception as err:
            send_email('fangtianxia next page url parse error', response.url + '\n' + traceback.format_exc())
            msg = 'next page url parse error url: ' + response.url + ' ' + '-'.join(str(arg) for arg in err.args)
            logging.error(msg=msg)

    def parse_subweb(self, response):
        # Level 2: community overview page -- follow the "xqxq" (detail) tab.
        try:
            url = response.xpath(r'//li[@data="xqxq"]/a/@href').extract_first()
            yield scrapy.Request(url, callback=self.parse_item, dont_filter=True)
        except Exception as err:
            send_email('fangtianxia get detail url error', response.url + '\n' + traceback.format_exc())
            msg = 'get detail url error url: ' + response.url + ' ' + '-'.join(str(arg) for arg in err.args)
            logging.error(msg=msg)

    def parse_item(self, response):
        # Level 3: community detail page -- build the item from the response.
        try:
            item = FTXCommunityItem.handle_response(response)
            yield item
        except Exception as err:
            send_email('fangtianxia lv 2 web parse error', response.url + '\n' + traceback.format_exc())
            msg = 'lv 2 web parse error url: ' + response.url + ' ' + '-'.join(str(arg) for arg in err.args)
            logging.error(msg=msg)
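
The spider depends on two helpers that live elsewhere in the project and are not shown in this file: send_email(subject, body) from elabSpider.email_util, and FTXCommunityItem.handle_response(response) from elabSpider.items (pulled in by the wildcard import). Below is a minimal sketch of the item side, assuming FTXCommunityItem is a Scrapy Item whose handle_response classmethod builds an item straight from the community detail page; the field names and XPath expressions are illustrative assumptions, not the project's actual definitions.

import scrapy


class FTXCommunityItem(scrapy.Item):
    # Hypothetical fields -- the real item presumably defines more.
    title = scrapy.Field()
    address = scrapy.Field()
    url = scrapy.Field()

    @classmethod
    def handle_response(cls, response):
        # Build the item directly from the detail-page response (assumed selectors).
        item = cls()
        item['title'] = response.xpath('//h1/text()').extract_first(default='').strip()
        item['address'] = response.xpath('//div[@class="address"]/text()').extract_first(default='')
        item['url'] = response.url
        return item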