zhaojh, 6 years ago
commit
e7a8d3f789

+ 339 - 0
LICENSE

@@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc., [http://fsf.org/]
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    {description}
+    Copyright (C) 2018  剑鸣
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  {signature of Ty Coon}, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.

+ 39 - 0
README.md

@@ -0,0 +1,39 @@
+# elabSpider
+
+#### Project introduction
+{**The text below is the Gitee platform boilerplate; replace it with your own project description**
+Gitee is a Git-based code hosting platform launched by OSChina (SVN is also supported), providing developers with a stable, efficient and secure cloud platform for collaborative software development.
+Individuals, teams and enterprises can all use Gitee for code hosting, project management and collaborative development. For enterprise projects see [https://gitee.com/enterprises](https://gitee.com/enterprises)}
+
+#### Software architecture
+Software architecture description
+
+
+#### Installation
+
+1. xxxx
+2. xxxx
+3. xxxx
+
+#### Usage
+
+1. xxxx
+2. xxxx
+3. xxxx
+
+#### Contributing
+
+1. Fork this repository
+2. Create a Feat_xxx branch
+3. Commit your code
+4. Open a Pull Request
+
+
+#### Gitee features
+
+1. Use Readme\_XXX.md to support different languages, e.g. Readme\_en.md, Readme\_zh.md
+2. Official Gitee blog: [blog.gitee.com](https://blog.gitee.com)
+3. Visit [https://gitee.com/explore](https://gitee.com/explore) to discover outstanding open-source projects on Gitee
+4. [GVP](https://gitee.com/gvp) stands for Gitee's Most Valuable open-source Projects, selected through Gitee's overall evaluation
+5. The official Gitee user manual: [http://git.mydoc.io/](http://git.mydoc.io/)
+6. "Gitee cover stars" is a column showcasing Gitee members: [https://gitee.com/gitee-stars/](https://gitee.com/gitee-stars/)

+ 0 - 0
elabSpider/__init__.py


+ 38 - 0
elabSpider/email_util.py

@@ -0,0 +1,38 @@
+# -*- coding:utf-8 -*-
+
+# @Time    : 2018/5/23 11:49 AM
+
+# @Author  : Swing
+
+
+import smtplib
+from email.mime.text import MIMEText
+from email.header import Header
+import traceback
+
+mail_host = 'smtp.exmail.qq.com'
+mail_user = 'zhaojh@elab-plus.com'
+mail_pass = 'Elab@123'
+
+sender = 'zhaojh@elab-plus.com'
+receivers = ['zhaojh@elab-plus.com']
+
+
+def send_email(title, content):
+    message = MIMEText(content, 'plain', 'utf-8')
+    message['From'] = Header("Tornado service", 'utf-8')
+    message['To'] = Header("Admin", 'utf-8')
+    message['Subject'] = Header(title, 'utf-8')
+
+    try:
+        smtp_obj = smtplib.SMTP()
+        smtp_obj.connect(mail_host, 25)
+        smtp_obj.login(mail_user, mail_pass)
+        smtp_obj.sendmail(sender, receivers, message.as_string())
+        smtp_obj.quit()
+        print('Mail sent successfully')
+    except smtplib.SMTPException:
+        print('Error: mail send failed: ' + traceback.format_exc())
+
+
+# Test code
+# send_email("test subject", "test body")

File diff suppressed because it is too large
+ 1 - 0
elabSpider/fake_useragent.json


+ 496 - 0
elabSpider/items.py

@@ -0,0 +1,496 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+from scrapy.selector import Selector
+import scrapy
+import re
+import time
+import logging
+
+
+class ResoldApartmentItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    unit_price = scrapy.Field()
+    total_price = scrapy.Field()
+    orientation = scrapy.Field()
+    area = scrapy.Field()
+    built_year = scrapy.Field()
+    property = scrapy.Field()
+    decoration = scrapy.Field()
+    model = scrapy.Field()
+    floor = scrapy.Field()
+    image = scrapy.Field()
+    house_type = scrapy.Field()
+    trading_ownership = scrapy.Field()
+    tag = scrapy.Field()
+    location = scrapy.Field()
+    longitude = scrapy.Field()
+    latitude = scrapy.Field()
+    page_url = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+        item['title'] = response.xpath(
+            '//ul[@class="house-basic-item3"]/li[1]/span[@class="c_000 mr_10"]/a[1]/text()').extract_first().strip()
+
+        unit_price_string = response.xpath('//span[@class="unit"]/text()').extract_first()
+        item['unit_price'] = re.search(r'[1-9][\d]*', unit_price_string).group()
+
+        # Total price: normalize "万" (10,000) and "千" (1,000) units to a plain number
+        total_price_string = response.xpath('//span[@class="price"]/text()').extract_first()
+        total_price = re.search(r'[0-9]+(\.)?[0-9]*', total_price_string).group()
+        price_unit = response.xpath(r'//span[@class="price"]/b/text()').extract_first()
+        if price_unit == '万':
+            total_price = str(int(float(total_price) * 10000))
+        elif price_unit == '千':
+            total_price = str(int(float(total_price) * 1000))
+        item['total_price'] = total_price
+
+        item['orientation'] = response.xpath('//p[@class="toward"]/span[@class="main"]/text()').extract_first()
+
+        # Year built
+        built_year_str = response.xpath(r'//p[@class="toward"]/span[@class="sub"]/text()').extract_first()
+        if built_year_str:
+            year_match = re.search(r'[\d]+', built_year_str)
+            if year_match:
+                item['built_year'] = year_match.group()
+
+        area_string = response.xpath(
+            '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[3]/span[2]/text()').extract_first()
+        item['area'] = re.search(r'[\d]+', area_string).group()
+
+        item['property'] = response.xpath(
+            '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-right"]/li[3]/span[2]/text()').re(r'[\d]+')[0]
+        item['decoration'] = response.xpath(
+            '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-right"]/li[2]/span[2]/text()').extract_first()
+        item['model'] = response.xpath(
+            '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[2]/span[2]/text()').extract_first()
+        item['floor'] = response.xpath(
+            '//div[@id="generalSituation"]/div[@class="general-item-wrap"]/ul[@class="general-item-right"]/li[1]/span[2]/text()').extract_first()
+        item['house_type'] = response.xpath('//div[@id="generalExpense"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[2]/span[2]/text()').extract_first()
+        item['trading_ownership'] = response.xpath('//div[@id="generalExpense"]/div[@class="general-item-wrap"]/ul[@class="general-item-left"]/li[3]/span[2]/text()').extract_first()
+        # Image
+        item['image'] = response.xpath(
+            r'//div[@class="basic-pic-list pr"]/ul[@id="leftImg"]/li[1]/img/@data-value').extract_first()
+
+        # Location
+        location_list = response.xpath(r'//ul[@class="house-basic-item3"]/li[2]/span[2]/a/text()').extract()
+        if location_list:
+            location_str = '-'.join(location_list)
+            item['location'] = location_str
+
+        # Tag
+        tag_list = response.xpath(r'//p[@class="house-update-info"]/span[@class="ts"]/text()').extract()
+        if tag_list:
+            item['tag'] = tag_list[0]
+
+        # Longitude and latitude, embedded in an inline <script> block
+        script_string = response.xpath(r'//script[@type="text/javascript"]').extract_first()
+        latitude_match = re.search(r'"lat":([1-9])[\d](\.)[\d]*,"', script_string)
+        longitude_match = re.search(r'"lon":[1-9][\d][\d](\.)[\d]*,"', script_string)
+        if latitude_match:
+            item['latitude'] = re.search(r'([1-9])[\d](\.)[\d]*', latitude_match.group()).group()
+        if longitude_match:
+            item['longitude'] = re.search(r'[1-9][\d][\d](\.)[\d]*', longitude_match.group()).group()
+
+        item['page_url'] = response.url
+
+        item['house_id'] = '109'
+
+        return item
+
+
+class CommunityItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    unit_price = scrapy.Field()
+    floating_rate = scrapy.Field()
+    built_year = scrapy.Field()
+    location = scrapy.Field()
+    page_url = scrapy.Field()
+    type = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response, type):
+        item = cls()
+        selector = Selector(text=response)
+        item['title'] = selector.xpath(r'//h3/a/@title').extract_first()
+        item['unit_price'] = selector.xpath(r'//div[@class="li-side"]/p/strong/text()').extract_first()
+        floating_rate = selector.xpath(r'//div[@class="li-side"]/p[@class="price-txt"]/text()').extract_first()
+        if not floating_rate:
+            floating_rate = selector.xpath(r'//div[@class="li-side"]/p[@class="price-txt price-down"]/text()').extract_first()
+        item['floating_rate'] = floating_rate
+        item['location'] = selector.xpath(r'//div[@class="li-info"]/address/text()').extract_first().strip()
+        item['page_url'] = selector.xpath(r'//div[@_soj="xqlb"]/@link').extract_first().strip()
+        item['built_year'] = selector.xpath(r'//p[@class="date"]/text()').extract_first().strip()
+        item['type'] = type
+        item['house_id'] = '109'
+        return item
+
+
+class FTXCommunityItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    unit_price = scrapy.Field()
+    floating_rate = scrapy.Field()
+    year_floating_rate = scrapy.Field()
+    built_year = scrapy.Field()
+    property = scrapy.Field()
+    property_type = scrapy.Field()
+    building_type = scrapy.Field()
+    greening_rate = scrapy.Field()
+    plot_ratio = scrapy.Field()
+    total_area = scrapy.Field()
+    building_area = scrapy.Field()
+    construction = scrapy.Field()
+    location = scrapy.Field()
+    region = scrapy.Field()
+    page_url = scrapy.Field()
+    img_url = scrapy.Field()
+    predict_type = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+        title_string: str = response.xpath(r'//div[@class="logoBox_sq"]/div[@class="ceninfo_sq"]/h1/a[@class="tt"]/text()').extract_first()
+        if title_string:
+            item['title'] = title_string.replace('小区网', '')
+        item['unit_price'] = response.xpath(r'//div[@class="box detaiLtop mt20 clearfix"]/dl[1]/dd/span/text()').extract_first()
+        item['floating_rate'] = response.xpath(r'//div[@class="box detaiLtop mt20 clearfix"]/dl[2]/dd/span/text()').extract_first()
+        item['year_floating_rate'] = response.xpath(r'//div[@class="box detaiLtop mt20 clearfix"]/dl[3]/dd/span/text()').extract_first()
+
+        item['location'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="小区地址:"]/text()').extract_first()
+        item['region'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="所属区域:"]/text()').extract_first()
+        property_string = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="产权描述:"]/text()').extract_first()
+        item['property'] = None
+        if property_string:
+            re_list = re.search(r'[\d]{1,2}', property_string)
+            if re_list:
+                item['property'] = re_list.group(0)
+
+        item['property_type'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="物业类别:"]/text()').extract_first()
+        if not item['property']:
+            item['predict_type'] = '其他'
+        elif item['property'] == '70':
+            item['predict_type'] = '住宅'
+        else:
+            item['predict_type'] = '公寓'
+
+        item['construction'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑结构:"]/span/text()').extract_first()
+        item['built_year'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑年代:"]/text()').extract_first()
+        item['building_type'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑类型:"]/text()').extract_first()
+        item['greening_rate'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="绿 化 率:"]/text()').extract_first()
+        item['plot_ratio'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="容 积 率:"]/text()').extract_first()
+        item['total_area'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="占地面积:"]/text()').extract_first()
+        item['building_area'] = response.xpath(r'//div[@class="box"][1]/div[@class="inforwrap clearfix"]/dl/dd[strong="建筑面积:"]/text()').extract_first()
+        item['img_url'] = response.xpath(r'//div[@class="logoBox_sq"]/div[@class="logopic_sq"]/a/img/@src').extract_first()
+        item['page_url'] = response.url
+        item['house_id'] = '109'
+
+        return item
+
+
+class RentalHouseItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    location = scrapy.Field()
+    price = scrapy.Field()
+    house_type = scrapy.Field()
+    area = scrapy.Field()
+    orientation = scrapy.Field()
+    floor = scrapy.Field()
+    decoration = scrapy.Field()
+    property_type = scrapy.Field()
+    house_code = scrapy.Field()
+    publish_date = scrapy.Field()
+    longitude = scrapy.Field()
+    latitude = scrapy.Field()
+    img_url = scrapy.Field()
+    page_url = scrapy.Field()
+    date = scrapy.Field()
+    coordinate = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+        name_list: list = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="小区:"]/a/text()').extract()
+        location_string = ''
+        if name_list:
+            if len(name_list) > 0:
+                item['title'] = name_list[0]
+            if len(name_list) > 1:
+                location_string += name_list[1]
+                location_string += "-"
+            if len(name_list) > 2:
+                location_string += name_list[2]
+        item['location'] = location_string
+        price_list = response.xpath(r'//li[@class="full-line cf"]/span[@class="price"]//text()').extract()
+        if price_list:
+            item['price'] = "".join(price_list)
+        item['house_type'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="户型:"]/span[@class="info"]/text()').extract_first()
+        item['area'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="面积:"]/span[@class="info"]/text()').extract_first()
+        item['orientation'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="朝向:"]/span[@class="info"]/text()').extract_first()
+        item['floor'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="楼层:"]/span[@class="info"]/text()').extract_first()
+        item['decoration'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="装修:"]/span[@class="info"]/text()').extract_first()
+        item['property_type'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="类型:"]/span[@class="info"]/text()').extract_first()
+
+        house_info_string = response.xpath(r'//div[@class="mod-title bottomed"][h3="房屋信息"]/div/text()').extract_first()
+        if house_info_string:
+            code_match = re.search(r'[\d]{6,}', house_info_string)
+            if code_match:
+                item['house_code'] = code_match.group()
+
+            date_match = re.search(r'[\d]{0,4}年[\d]{0,2}月[\d]{0,2}日', house_info_string)
+            if date_match:
+                item['publish_date'] = date_match.group()
+
+        longitude_match = Selector(response).re(r'lng:[\d]{0,3}[\.][\d]*,')
+        if longitude_match:
+            item['longitude'] = re.search(r'[\d]{0,3}[\.][\d]*', longitude_match[0]).group()
+
+        latitude_match = Selector(response).re(r'lat:[\d]{0,2}[\.][\d]*,')
+        if latitude_match:
+            item['latitude'] = re.search(r'[\d]{0,2}[\.][\d]*', latitude_match[0]).group()
+
+        if longitude_match and latitude_match:
+            try:
+                item['coordinate'] = [float(item['longitude']), float(item['latitude'])]
+            except Exception as err:
+                logging.error('type conversion error! reason: %s', err)
+
+        item['img_url'] = response.xpath(r'//div[@class="switch_list"][1]/div[@class="img_wrap"][1]/img[1]/@data-src').extract_first()
+        item['page_url'] = response.url
+        item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
+        item['house_id'] = '109'
+
+        return item
+
+
+class FTXRentalHouseItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    location = scrapy.Field()
+    price = scrapy.Field()
+    house_type = scrapy.Field()
+    area = scrapy.Field()
+    orientation = scrapy.Field()
+    floor = scrapy.Field()
+    decoration = scrapy.Field()
+    # property_type = scrapy.Field()
+    house_code = scrapy.Field()
+    update_date = scrapy.Field()
+    # longitude = scrapy.Field()
+    # latitude = scrapy.Field()
+    img_url = scrapy.Field()
+    page_url = scrapy.Field()
+    date = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+
+        house_info_match = Selector(response).re(r'var houseInfo = {[\s\S]*};')
+        if house_info_match:
+            info_str = house_info_match[0]
+            title_match = re.search(r"projname: '[\s\S]*?',", info_str)
+            if title_match:
+                title_str = title_match[0]
+                title_str = title_str.replace("projname: '", '')
+                title_str = title_str.replace("',", '')
+                item['title'] = title_str
+
+            district_math = re.search(r"district: '[\s\S]*?',", info_str)
+            location_string = ''
+            if district_math:
+                district_str = district_math[0]
+                district_str = district_str.replace("district: '", '')
+                district_str = district_str.replace("',", '')
+                location_string += district_str
+                location_string += '-'
+
+            comarea_math = re.search(r"comarea: '[\s\S]*?',", info_str)
+            if comarea_math:
+                comarea_str = comarea_math[0]
+                comarea_str = comarea_str.replace("comarea: '", '')
+                comarea_str = comarea_str.replace("',", '')
+                location_string += comarea_str
+
+            item['location'] = location_string
+
+
+        # name_list: list = response.xpath(r'//div[div[@class="lab"][text()="小      区"]]/div[contains(@class, "rcont")]/a/text()').extract()
+        # location_string = ''
+        # if name_list:
+        #     if len(name_list) > 0:
+        #         item['title'] = name_list[0]
+        #     if len(name_list) > 1:
+        #         location_string += name_list[1]
+        #         location_string += "-"
+        #     if len(name_list) > 2:
+        #         location_string += name_list[2]
+        # item['location'] = location_string
+        price_list = response.xpath(r'//div[@class ="tab-cont-right"]/div[@class ="tr-line clearfix zf_new_title"]/div[contains(@class, "trl-item sty1")]//text()').extract()
+        if price_list:
+            item['price'] = "".join(price_list).strip()
+        item['house_type'] = response.xpath(r'//div[@class="trl-item1 w182"][div[@class="font14"]="户型"]/div[@class="tt"]/text()').extract_first()
+        item['area'] = response.xpath(r'//div[@class="trl-item1 w132"][div[@class="font14"]="建筑面积"]/div[@class="tt"]/text()').extract_first()
+        item['orientation'] = response.xpath(r'//div[@class="trl-item1 w146"][div[@class="font14"]="朝向"]/div[@class="tt"]/text()').extract_first()
+
+        floor_list = response.xpath(r'//div[@class="trl-item1 w182"][div[@class="font14"][contains(text(), "楼层")]]/div//text()').extract()
+        if floor_list:
+            floor_str = '-'.join(floor_list)
+            item['floor'] = floor_str
+
+        item['decoration'] = response.xpath(r'//div[@class="trl-item1 w132"][div[@class="font14"]="装修"]/div[@class="tt"]/text()').extract_first()
+        # item['property_type'] = response.xpath(r'//ul[@class="house-info-zufang cf"]/li[span="类型:"]/span[@class="info"]/text()').extract_first()
+
+        # house_info_string = response.xpath(r'//div[@class="mod-title bottomed"][h3="房屋信息"]/div/text()').extract_first()
+        house_code_string = response.xpath(r'//span[contains(text(), "房源编号")]/text()').extract_first()
+        if house_code_string:
+            code_match = re.search(r'[\d]{6,}', house_code_string)
+            if code_match:
+                item['house_code'] = code_match.group()
+        house_date_string = response.xpath(r'//span[contains(text(), "更新时间")]/text()').extract_first()
+        if house_date_string:
+            date_match = re.search(r'[\d]{0,4}-[\d]{0,2}-[\d]{0,2}', house_date_string)
+            if date_match:
+                item['update_date'] = date_match.group()
+
+        # longitude_match = Selector(response).re(r'lng:[\d]{0,3}[\.][\d]*,')
+        # if longitude_match:
+        #     item['longitude'] = re.search(r'[\d]{0,3}[\.][\d]*', longitude_match[0]).group()
+        #
+        # latitude_match = Selector(response).re(r'[\d]{0,2}[\.][\d]*,')
+        # if latitude_match:
+        #     item['latitude'] = re.search(r'[\d]{0,2}[\.][\d]*', latitude_match[0]).group()
+
+        item['img_url'] = response.xpath(r'//div[@class="bigImg"]/img[1]/@src').extract_first()
+        item['page_url'] = response.url
+        item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
+        item['house_id'] = '109'
+
+        return item
+
+
+class ResoldHouseItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    location = scrapy.Field()
+    price = scrapy.Field()
+    house_type = scrapy.Field()
+    area = scrapy.Field()
+    orientation = scrapy.Field()
+    floor = scrapy.Field()
+    decoration = scrapy.Field()
+    property_type = scrapy.Field()
+    total_price = scrapy.Field()
+    down_payment = scrapy.Field()
+    monthly_payment = scrapy.Field()
+    house_code = scrapy.Field()
+    publish_date = scrapy.Field()
+    house_price_info = scrapy.Field()
+    community_price_info = scrapy.Field()
+    area_price_info = scrapy.Field()
+    longitude = scrapy.Field()
+    latitude = scrapy.Field()
+    img_url = scrapy.Field()
+    page_url = scrapy.Field()
+    date = scrapy.Field()
+    coordinate = scrapy.Field()
+    house_id = scrapy.Field()
+    build_year = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+
+        item['title'] = response.xpath(r'//li[div[text()="所属小区:"]]/div/a/text()').extract_first()
+        location_list = response.xpath(r'//li[div[text()="所在位置:"]]/div/p//text()').extract()
+        if location_list:
+            location = ''.join(location_list).strip()
+            location = location.replace('\n', '')
+            location = location.replace('\t', '')
+            item['location'] = location
+
+        item['price'] = response.xpath(r'//li[div[text()="房屋单价:"]]/div[2]/text()').extract_first()
+        house_type_string: str = response.xpath(r'normalize-space(//li[div[text()="房屋户型:"]]/div[2]/text())').extract_first()
+        if house_type_string:
+            item['house_type'] = house_type_string.replace(' ', '')
+
+        item['area'] = response.xpath(r'//li[div[text()="建筑面积:"]]/div[2]/text()').extract_first()
+        item['orientation'] = response.xpath(r'//li[div[text()="房屋朝向:"]]/div[2]/text()').extract_first()
+        item['floor'] = response.xpath(r'normalize-space(//li[div[text()="所在楼层:"]]/div[2]/text())').extract_first()
+        item['decoration'] = response.xpath(r'normalize-space(//li[div[text()="装修程度:"]]/div[2]/text())').extract_first()
+        item['property_type'] = response.xpath(r'normalize-space(//li[div[text()="房屋类型:"]]/div[2]/text())').extract_first()
+        item['total_price'] = response.xpath(r'//div[@class="wrapper"]/div[@class="wrapper-lf clearfix"]/div[@class="basic-info clearfix"]/span[1]/em/text()').extract_first()
+        item['down_payment'] = response.xpath(r'normalize-space(//li[div[text()="参考首付:"]]/div[2]/text())').extract_first()
+        item['monthly_payment'] = response.xpath(r'normalize-space(//li[div[text()="参考月供:"]]/div/span/text())').extract_first()
+        item['build_year'] = response.xpath(r'normalize-space(//li[div[text()="建造年代:"]]/div[2]/text())').extract_first()
+
+        house_code_string = response.xpath(r'//span[contains(text(), "房屋编码")]/text()').extract_first()
+        if house_code_string:
+            code_match = re.search(r'[\d]{6,}', house_code_string)
+            if code_match:
+                item['house_code'] = code_match.group()
+        house_date_string = response.xpath(r'//span[contains(text(), "发布时间")]/text()').extract_first()
+        if house_date_string:
+            date_match = re.search(r'[\d]{0,4}年[\d]{0,2}月[\d]{0,2}日', house_date_string)
+            if date_match:
+                item['publish_date'] = date_match.group()
+
+        longitude_match = Selector(response).re(r'lng : "[\d]{0,3}[\.][\d]*"')
+        if longitude_match:
+            item['longitude'] = re.search(r'[\d]{0,3}[\.][\d]*', longitude_match[0]).group()
+
+        latitude_match = Selector(response).re(r'lat : "[\d]{0,2}[\.][\d]*"')
+        if latitude_match:
+            item['latitude'] = re.search(r'[\d]{0,2}[\.][\d]*', latitude_match[0]).group()
+
+        if longitude_match and latitude_match:
+            try:
+                item['coordinate'] = [float(item['longitude']), float(item['latitude'])]
+            except Exception as err:
+                logging.error('type conversion error! reason: %s', err)
+
+        item['img_url'] = response.xpath(r'//div[@class="switch_list"][1]/div[@class="img_wrap"][1]/img/@data-src').extract_first()
+
+        item['page_url'] = response.url
+        item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
+        item['house_id'] = '109'
+
+        return item
+
+
+class LfsAveragePriceItem(scrapy.Item):
+    _id = scrapy.Field()
+    title = scrapy.Field()
+    price = scrapy.Field()
+    arrow = scrapy.Field()
+    rate = scrapy.Field()
+    page_url = scrapy.Field()
+    date = scrapy.Field()
+    house_id = scrapy.Field()
+
+    @classmethod
+    def handle_response(cls, response):
+        item = cls()
+
+        item['title'] = response.xpath(r'//div[@class="comm-title"]/a/@title').extract_first()
+        price_math = Selector(response).re(r'(?<="comm_midprice":")([0-9]*(?=","area_midprice))')
+        if price_math:
+            item['price'] = price_math[0]
+        # item['arrow'] = response.xpath(r'//i[@class="arrow"]/text()').extract_first()
+        # item['rate'] = response.xpath(r'normalize-space(//span[@class="status level"]/text())').extract_first()
+        item['page_url'] = response.url
+        item['date'] = time.strftime('%Y-%m-%d %H:%M:%S')
+        item['house_id'] = '109'
+
+        return item
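Each item class above exposes a `handle_response` classmethod that turns a detail-page response into a populated item. The spiders themselves are not part of this diff, so the callback below is only an illustrative sketch of how such a class is meant to be used (the spider name and URL are placeholders):

```python
# Illustrative sketch only; the real spiders are not included in this commit.
import scrapy

from elabSpider.items import ResoldHouseItem


class ExampleResoldHouseSpider(scrapy.Spider):
    name = 'exampleResoldHouse'                         # hypothetical spider name
    start_urls = ['https://example.com/house/1.html']   # placeholder URL

    def parse(self, response):
        # The item class does all of the field extraction from the page.
        yield ResoldHouseItem.handle_response(response)
```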

+ 172 - 0
elabSpider/middlewares.py

@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy.downloadermiddlewares.retry import RetryMiddleware
+from scrapy.utils.response import response_status_message
+from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
+from scrapy import signals
+from scrapy.conf import settings
+import logging
+import time
+import fake_useragent
+
+
+class ElabspiderSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class ElabspiderDownloaderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+
+        # proxy_user_pass = settings['PROXY_IDENTIFY'] + ':' + settings['PROXY_SECRETKEY']
+        # encoded_proxy_pass = base64.encodebytes(bytes(proxy_user_pass.encode(encoding='utf-8')))
+
+        # TODO: toggle the proxy on/off here
+        # logging.info(msg='process request url: ' + request._url)
+        # request.meta['proxy'] = settings['PROXY_HOST'] + ':' + settings['PROXY_PORT']
+        # request.headers['Proxy-Authorization'] = 'Basic ' + 'SDVQMDI5OU44MzBBQzlDRDo1MTZGOTVEMDNFQjFGMDI2'
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        logging.info(msg='received response url: ' + response.url + ' status: ' + str(response.status))
+        # if response.status != 200:
+        #     logging.debug('retry url: ' + response._url)
+        #     # proxy = self.get_random_proxy()
+        #     # request.meta['proxy'] = proxy
+        #     return request
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+    # def get_random_proxy(self):
+    #     while 1:
+    #         with open('proxies.txt', 'r') as f:
+    #             proxies = f.readlines()
+    #         if proxies:
+    #             break
+    #         else:
+    #             time.sleep(1)
+    #     proxy = random.choice(proxies).strip()
+    #     return proxy
+
+
+class TooManyRequestsRetryMiddleware(RetryMiddleware):
+
+    def __init__(self, crawler):
+        super(TooManyRequestsRetryMiddleware, self).__init__(crawler.settings)
+        self.crawler = crawler
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
+    def process_response(self, request, response, spider):
+        if request.meta.get('dont_retry', False):
+            return response
+        elif response.status == 429:
+            self.crawler.engine.pause()
+            time.sleep(10)  # If the rate limit is renewed in a minute, put 60 seconds, and so on.
+            self.crawler.engine.unpause()
+            reason = response_status_message(response.status)
+            return self._retry(request, reason, spider) or response
+        elif response.status in self.retry_http_codes:
+            reason = response_status_message(response.status)
+            return self._retry(request, reason, spider) or response
+        return response
+
+
+class UserAgent(UserAgentMiddleware):
+
+    def __init__(self, user_agent=''):
+        self.user_agent = user_agent
+
+    def process_request(self, request, spider):
+        # agent = random.choice(self.user_agent_list)
+        agent = fake_useragent.UserAgent(path=settings['USER_AGENT_PATH']).random
+        if agent:
+            # print("********Current UserAgent:%s************" % agent)
+            # log(level=logging.DEBUG, msg='Current UserAgent: ' + agent)
+            request.headers.setdefault('User-Agent', agent)
+
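TooManyRequestsRetryMiddleware pauses the whole engine when an HTTP 429 response arrives and then re-queues the request, while UserAgent rotates a random fake_useragent header per request. In the settings below the retry middleware entry ships commented out; a hedged sketch of what enabling it would look like (the priorities are illustrative, mirroring the commented entries in settings.py):

```python
# Hypothetical settings snippet; in this commit the retry entry is commented out.
DOWNLOADER_MIDDLEWARES = {
    'elabSpider.middlewares.UserAgent': 1,
    'elabSpider.middlewares.TooManyRequestsRetryMiddleware': 500,
    'elabSpider.middlewares.ElabspiderDownloaderMiddleware': 543,
    # Disable the stock retry middleware so only the subclass handles retries.
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}
```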

+ 55 - 0
elabSpider/pipelines.py

@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+import pymongo
+from scrapy.conf import settings
+from elabSpider.items import *
+import urllib.parse
+
+
+class ElabspiderPipeline(object):
+
+    def __init__(self):
+        # self.client = pymongo.MongoClient(host='139.196.5.59', port=27017)
+        # self.client.admin.authenticate(name='dbuser', password='elab@123', mechanism='SCRAM-SHA-1')
+        # username = urllib.parse.quote_plus('dbuser')
+        # password = urllib.parse.quote_plus('elab@123')
+        # uri = "mongodb://dbuser:elab@123@example.com/?authSource=the_database&authMechanism=SCRAM-SHA-1"
+        # self.client = pymongo.MongoClient('101.132.106.154', authSource='logdb')
+
+        self.client = pymongo.MongoClient(settings['MONGO_HOST'], authSource='logdb')
+
+        self.db = self.client[settings['MONGO_DB']]
+        self.coll = self.db[settings['MONGO_COLL']]
+
+    def process_item(self, item, spider):
+        if isinstance(item, CommunityItem):
+            self.coll = self.db['departmengprice']
+        elif isinstance(item, FTXCommunityItem):
+            self.coll = self.db['ftxcommunity']
+        elif isinstance(item, ResoldApartmentItem):
+            self.coll = self.db[settings['MONGO_COLL']]
+        elif isinstance(item, RentalHouseItem):
+            if spider.name == 'lfsrentalHouse':
+                self.coll = self.db['lfs_rental_house']
+            elif spider.name == 'sjkrentalHouse':
+                self.coll = self.db['sjk_rental_house']
+            else:
+                self.coll = self.db['rental_house']
+        elif isinstance(item, FTXRentalHouseItem):
+            self.coll = self.db['ftx_rental_house']
+        elif isinstance(item, ResoldHouseItem):
+            if spider.name == 'sjkresoldHouse':
+                self.coll = self.db['sjk_resold_house']
+            elif spider.name == 'lfsresoldHouse':
+                self.coll = self.db['lfs_resold_house']
+            else:
+                self.coll = self.db['nb_resold_house']
+        elif isinstance(item, LfsAveragePriceItem):
+            self.coll = self.db['lfs_average_price']
+        self.coll.insert_one(item)
+        return item
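As the header comment notes, this pipeline only runs once it is registered under ITEM_PIPELINES. That setting does not appear in the portion of settings.py shown below, so the snippet here is just the usual registration pattern (the priority value is an arbitrary example):

```python
# Hypothetical registration; 300 is an arbitrary example priority.
ITEM_PIPELINES = {
    'elabSpider.pipelines.ElabspiderPipeline': 300,
}
```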

+ 106 - 0
elabSpider/proxies.py

@@ -0,0 +1,106 @@
+# -*- coding:utf-8 -*-
+
+# @Time    : 2018/4/19 3:47 PM
+
+# @Author  : Swing
+
+
+from bs4 import BeautifulSoup
+import lxml
+from multiprocessing import Process, Queue
+import random
+import json
+import time
+import requests
+
+
+class Proxies(object):
+
+    def __init__(self, page=3):
+        self.proxies = []
+        self.verify_pro = []
+        self.page = page
+        self.headers = {
+            'Accept': '*/*',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
+            'Accept-Encoding': 'gzip, deflate, sdch',
+            'Accept-Language': 'zh-CN,zh;q=0.8'
+        }
+
+        self.get_proxies()
+        self.get_proxies_nn()
+
+    def get_proxies(self):
+        page = random.randint(1, 10)
+        page_stop = page + self.page
+        while page < page_stop:
+            url = 'http://www.xicidaili.com/nt/%d' % page
+            html = requests.get(url, headers=self.headers).content
+            soup = BeautifulSoup(html, 'lxml')
+            ip_list = soup.find(id='ip_list')
+            for odd in ip_list.find_all(class_='odd'):
+                protocol = odd.find_all('td')[5].get_text().lower() + '://'
+                self.proxies.append(protocol + ':'.join([x.get_text() for x in odd.find_all('td')[1:3]]))
+            page += 1
+
+    def get_proxies_nn(self):
+        page = random.randint(1, 10)
+        page_stop = page + self.page
+        while page < page_stop:
+            url = 'http://www.xicidaili.com/nn/%d' % page
+            html = requests.get(url, headers=self.headers).content
+            soup = BeautifulSoup(html, 'lxml')
+            ip_list = soup.find(id='ip_list')
+            for odd in ip_list.find_all(class_='odd'):
+                protocol = odd.find_all('td')[5].get_text().lower() + '://'
+                self.proxies.append(protocol + ':'.join([x.get_text() for x in odd.find_all('td')[1:3]]))
+            page += 1
+
+    def verify_proxies(self):
+        # Proxies not yet verified
+        old_queue = Queue()
+        # Verified proxies
+        new_queue = Queue()
+        print('verify proxy......')
+        works = []
+        for i in range(15):
+            works.append(Process(target=self.verify_one_proxy, args=(old_queue, new_queue)))
+        for work in works:
+            work.start()
+        for proxy in self.proxies:
+            old_queue.put(proxy)
+        for work in works:
+            old_queue.put(0)
+        for work in works:
+            work.join()
+        self.proxies = []
+        while 1:
+            try:
+                self.proxies.append(new_queue.get(timeout=1))
+            except:
+                break
+        print('verify_proxies done!')
+
+    def verify_one_proxy(self, old_queue, new_queue):
+        while 1:
+            proxy = old_queue.get()
+            if proxy == 0:
+                break
+            protocol = 'https' if 'https' in proxy else 'http'
+            proxies = {protocol: proxy}
+            try:
+                if requests.get('http://www.baidu.com', proxies=proxies, timeout=2).status_code == 200:
+                    print('success %s' % proxy)
+                    new_queue.put(proxy)
+            except:
+                print('fail %s' % proxy)
+
+
+if __name__ == '__main__':
+    a = Proxies()
+    a.verify_proxies()
+    print(a.proxies)
+    proxies = a.proxies
+    with open('proxies.txt', 'a') as f:
+        for proxy in proxies:
+            f.write(proxy + '\n')
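Note that the Proxies helper above only verifies addresses and appends them to proxies.txt; nothing in this file reads that list back during a crawl. A minimal sketch of how the verified list could be consumed from a Scrapy downloader middleware follows; the class name and the PROXY_FILE setting are assumptions for illustration, not part of this commit:

# Hypothetical middleware, not part of this commit: picks a random proxy from
# proxies.txt for every outgoing request.
import random


class RandomFileProxyMiddleware(object):

    def __init__(self, proxy_file):
        with open(proxy_file) as f:
            # one proxy per line, e.g. "http://1.2.3.4:8080"
            self.proxies = [line.strip() for line in f if line.strip()]

    @classmethod
    def from_crawler(cls, crawler):
        # PROXY_FILE is an assumed setting name; it is not defined in settings.py
        return cls(crawler.settings.get('PROXY_FILE', 'proxies.txt'))

    def process_request(self, request, spider):
        if self.proxies:
            request.meta['proxy'] = random.choice(self.proxies)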

+ 0 - 0
elabSpider/proxies.txt


+ 129 - 0
elabSpider/settings.py

@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for elabSpider project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'elabSpider'
+
+SPIDER_MODULES = ['elabSpider.spiders']
+NEWSPIDER_MODULE = 'elabSpider.spiders'
+
+# MONGO_HOST = '139.196.5.59'  # test environment
+# MONGO_HOST = '139.196.108.59'    # production environment (deprecated)
+MONGO_HOST = 'mongodb://logdb:logdb@dds-uf6da0fedc9881d41450-pub.mongodb.rds.aliyuncs.com:3717,dds-uf6da0fedc9881d42459-pub.mongodb.rds.aliyuncs.com:3717/logdb?replicaSet=mgset-12835903'
+MONGO_PORT = 27017
+MONGO_DB = 'logdb'
+MONGO_COLL = 'ershoufang'
+MONGO_USER = 'dbuser'
+MONGO_PSW = 'elab@123'
+
+PROXY_HOST = 'http://http-dyn.abuyun.com'
+PROXY_PORT = '9020'
+# PROXY_IDENTIFY = 'HY39548V0FZ45UKD'
+# PROXY_SECRETKEY = '07DBA6C5E470150B'
+
+USER_AGENT_PATH = 'fake_useragent.json'
+
+# LOG_FILE = 'spider.log'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'elabSpider (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+CONCURRENT_REQUESTS_PER_IP = 5
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# COOKIES_DEBUG = True
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'elabSpider.middlewares.ElabspiderSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+
+DOWNLOADER_MIDDLEWARES = {
+   'elabSpider.middlewares.UserAgent': 1,
+   # 'elabSpider.middlewares.TooManyRequestsRetryMiddleware': 500,
+   'elabSpider.middlewares.ElabspiderDownloaderMiddleware': 543,
+   'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 550,
+   'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
+   'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': None,
+   'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': None,
+   # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None
+
+}
+
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'elabSpider.pipelines.ElabspiderPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 0.25
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 5.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = True
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+REDIRECT_ENABLED = False
+RETRY_ENABLED = True
+RETRY_TIMES = 10
+RETRY_HTTP_CODES = [403, 429, 404, 301, 302, 503]
+HTTPERROR_ALLOWED_CODES = [403, 429, 404, 301, 302, 503]
+
+DOWNLOAD_TIMEOUT = 15
+
+# RANDOMIZE_DOWNLOAD_DELAY = False
+# CONCURRENT_REQUESTS_PER_IP = 40
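PROXY_HOST and PROXY_PORT above point at abuyun's dynamic proxy service, but the matching identify/secret-key pair is commented out, so authentication has to happen wherever the proxy is attached to requests. A minimal sketch of the usual wiring with placeholder credentials; the middleware actually used in this project may differ:

# Sketch only: turning the abuyun PROXY_* settings into a per-request proxy
# with Basic auth. The credentials below are placeholders.
import base64


class AbuyunProxyMiddleware(object):

    proxy_server = 'http://http-dyn.abuyun.com:9020'  # PROXY_HOST + ':' + PROXY_PORT
    proxy_user = 'YOUR_IDENTIFY'                      # cf. the commented-out PROXY_IDENTIFY
    proxy_pass = 'YOUR_SECRETKEY'                     # cf. the commented-out PROXY_SECRETKEY

    def process_request(self, request, spider):
        auth = base64.b64encode(
            ('%s:%s' % (self.proxy_user, self.proxy_pass)).encode()
        ).decode()
        request.meta['proxy'] = self.proxy_server
        request.headers['Proxy-Authorization'] = 'Basic ' + auth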

+ 4 - 0
elabSpider/spiders/__init__.py

@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

+ 99 - 0
elabSpider/spiders/departmentprice.py

@@ -0,0 +1,99 @@
+# -*- coding:utf-8 -*-
+
+# @Time    : 2018/4/27 10:50 AM
+
+# @Author  : Swing
+
+import scrapy
+from elabSpider.items import CommunityItem
+import traceback
+from elabSpider.email_util import send_email
+
+
+class DepartmentPriceSpider(scrapy.Spider):
+    name = 'departmentPrice'
+    allowed_domains = [
+        'shanghai.anjuke.com',
+        'hangzhou.anjuke.com',
+        'suzhou.anjuke.com',
+        'nb.anjuke.com'
+    ]
+    start_urls = [
+        'https://shanghai.anjuke.com/community/xujiahui/',
+        'https://shanghai.anjuke.com/community/nanjingxilu/',
+        'https://shanghai.anjuke.com/community/jingansi/',
+        'https://shanghai.anjuke.com/community/lujiazui/',
+        'https://shanghai.anjuke.com/community/nanjingdonglu/',
+        'https://shanghai.anjuke.com/community/renminguangchang/',
+        'https://shanghai.anjuke.com/community/xintiandia/',
+
+        'https://hangzhou.anjuke.com/community/gulouy/t30/',
+        'https://hangzhou.anjuke.com/community/hubin/t30/',
+        'https://hangzhou.anjuke.com/community/wushana/t30/',
+        'https://hangzhou.anjuke.com/community/wulin/t30/',
+        'https://hangzhou.anjuke.com/community/xihuwenhuaguangchang/t30/',
+        'https://hangzhou.anjuke.com/community/qianjiangxincheng/t30/',
+
+        'https://suzhou.anjuke.com/community/guanqianjie/t34/',
+        'https://suzhou.anjuke.com/community/pingjianglua/t34/',
+        'https://suzhou.anjuke.com/community/shilus/t34/',
+        'https://suzhou.anjuke.com/community/shishanw/t34/',
+        'https://suzhou.anjuke.com/community/hudongs/t34/',
+        'https://suzhou.anjuke.com/community/huxi/t34/',
+
+        'https://nb.anjuke.com/community/tianyiguangchang/t107/',
+        'https://nb.anjuke.com/community/gulouh/t107/',
+        'https://nb.anjuke.com/community/dongbuxinchengw/t107/',
+        'https://nb.anjuke.com/community/baizhangt/t107/',
+        'https://nb.anjuke.com/community/zhongma/t107/',
+
+
+        'https://hangzhou.anjuke.com/community/gulouy/t29/',
+        'https://hangzhou.anjuke.com/community/hubin/t29/',
+        'https://hangzhou.anjuke.com/community/wushana/t29/',
+        'https://hangzhou.anjuke.com/community/wulin/t29/',
+        'https://hangzhou.anjuke.com/community/xihuwenhuaguangchang/t29/',
+        'https://hangzhou.anjuke.com/community/qianjiangxincheng/t29/',
+
+        'https://suzhou.anjuke.com/community/guanqianjie/t33/',
+        'https://suzhou.anjuke.com/community/pingjianglua/t33/',
+        'https://suzhou.anjuke.com/community/shilus/t33/',
+        'https://suzhou.anjuke.com/community/shishanw/t33/',
+        'https://suzhou.anjuke.com/community/hudongs/t33/',
+        'https://suzhou.anjuke.com/community/huxi/t33/',
+
+        'https://nb.anjuke.com/community/tianyiguangchang/t105/',
+        'https://nb.anjuke.com/community/gulouh/t105/',
+        'https://nb.anjuke.com/community/dongbuxinchengw/t105/',
+        'https://nb.anjuke.com/community/baizhangt/t105/',
+        'https://nb.anjuke.com/community/zhongma/t105/'
+    ]
+
+    def parse(self, response):
+        try:
+            community_list = response.xpath('//div[@class="maincontent"]/div[@class="list-content"]/div[@_soj="xqlb"]').extract()
+            # house_type = ''
+            house_type = response.xpath('//div[@class="items"][3]/span[@class="elems-l pp-mod"]/a[@class="selected-item"]/text()').extract_first()
+            if not house_type:
+                house_type = response.xpath('//div[@class="items no-border-bottom"]/span[@class="elems-l "]/a[@class="selected-item"]/text()').extract_first()
+
+            if not house_type:
+                house_type = response.xpath('//div[@class="items"][3]/span[@class="elems-l "]/a[@class="selected-item"]/text()').extract_first()
+
+            # if not house_type:
+            #     print('error')
+
+            if community_list:
+                for community in community_list:
+                    item = CommunityItem.handle_response(community, house_type)
+                    yield item
+        except:
+            send_email('departmentPrice lv 1 web parse error', response._url + '\n' + traceback.format_exc())
+            print('error info: ', response.url)
+
+        try:
+            next_page = response.xpath(r'//div[@class="page-content"]/div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first()
+            if next_page:
+                yield scrapy.Request(next_page, callback=self.parse)
+        except:
+            send_email('departmentPrice get next page url error', response._url + '\n' + traceback.format_exc())
+            print('error info: ', response.url)

+ 24 - 0
elabSpider/spiders/example.py

@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from elabSpider.items import *
+
+
+class ExampleSpider(scrapy.Spider):
+    name = 'test'
+    allowed_domains = ['nb.58.com']
+    start_urls = ['http://nb.58.com/ershoufang/37267775882391x.shtml']
+
+    def parse(self, response):
+        try:
+            item = ResoldApartmentItem.handle_response(response)
+            yield item
+        except:
+            print('error ' + response.url)
+
+
+    # def parse_item(self, response):
+    #     try:
+    #         ResoldApartmentItem.parse_item(response)
+    #     except:
+    #         print('error' + response.string)
+
+

+ 83 - 0
elabSpider/spiders/fangtianxiacommunity.py

@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+from elabSpider.items import *
+from scrapy.utils.response import get_base_url
+from urllib import parse
+import logging
+import scrapy
+from elabSpider.email_util import send_email
+import traceback
+
+
+class ExampleSpider(scrapy.Spider):
+    name = 'fangtianxia'
+    allowed_domains = ['fang.com']
+    start_urls = [
+        'http://esf.sh.fang.com/housing/25_1633_1_0_0_0_1_0_0_0/',
+        'http://esf.sh.fang.com/housing/19_103_1_0_0_0_1_0_0_0/',
+        'http://esf.sh.fang.com/housing/21_1622_1_0_0_0_1_0_0_0/',
+        'http://esf.sh.fang.com/housing/21_1623_1_0_0_0_1_0_0_0/',
+        'http://esf.sh.fang.com/housing/24_5240_1_0_0_0_1_0_0_0/',
+        'http://esf.sh.fang.com/housing/24_5239_1_0_0_0_1_0_0_0/',
+        'http://esf.sh.fang.com/housing/22_1625_1_0_0_0_1_0_0_0/',
+        'http://esf.hz.fang.com/housing/149__1_0_0_0_1_0_0_0/',
+        'http://esf.hz.fang.com/housing/150__1_0_0_0_1_0_0_0/',
+        'http://esf.hz.fang.com/housing/153__1_0_0_0_1_0_0_0/',
+        'http://esf.suzhou.fang.com/housing/13102__1_0_0_0_1_0_0_0/',
+        'http://esf.suzhou.fang.com/housing/278_4008_1_0_0_0_1_0_0_0/',
+        'http://esf.suzhou.fang.com/housing/277__1_0_0_0_1_0_0_0/',
+        'http://esf.nb.fang.com/housing/162_4220_1_0_0_0_1_0_0_0/',
+        'http://esf.nb.fang.com/housing/162_13968_1_0_0_0_1_0_0_0/',
+        'http://esf.nb.fang.com/housing/1047_13973_1_0_0_0_1_0_0_0/',
+        'http://esf.nb.fang.com/housing/1047_17420_1_0_0_0_1_0_0_0/',
+        'http://esf.nb.fang.com/housing/164__1_0_0_0_1_0_0_0/'
+    ]
+    # start_urls = ['http://huanqiuguangchang2.fang.com/xiangqing/']
+    # rules = (
+    #     Rule(link_extractor=r'http://[.]+(\.)fang.com/xiangqing', callback='parse_item')
+    # )
+
+    def parse(self, response):
+        # item = FTXCommunityItem.handle_response(response)
+        # yield item
+        try:
+            for href in response.xpath(r'//a[@class="plotTit"]/@href'):
+                url = href.extract() # type: str
+                if not url.startswith('http'):
+                    url = parse.urljoin(get_base_url(response), url)
+                yield scrapy.Request(url, callback=self.parse_subweb, dont_filter=True)
+
+        except Exception as err:
+            send_email('fangtianxia lv 1 web parse error', response._url + '\n' + traceback.format_exc())
+            msg = 'lv 1 web parse error url: ' + response._url + '-'.join(err.args)
+            logging.error(msg=msg)
+
+        try:
+            next_page = response.xpath(r'//div[@class="fanye gray6"]/a[@id="PageControl1_hlk_next"]/@href').extract_first()
+            if next_page:
+                base_url = get_base_url(response)
+                full_url = parse.urljoin(base_url, next_page)
+                yield scrapy.Request(full_url, callback=self.parse, dont_filter=True)
+        except Exception as err:
+            send_email('fangtianxia next page url parse error', response._url + '\n' + traceback.format_exc())
+            msg = 'next page url parse error url: ' + response._url + '-'.join(err.args)
+            logging.error(msg=msg)
+
+    def parse_subweb(self, response):
+        try:
+            url = response.xpath(r'//li[@data="xqxq"]/a/@href').extract_first()
+            yield scrapy.Request(url, callback=self.parse_item, dont_filter=True)
+        except Exception as err:
+            send_email('fangtianxia get detail url error', response._url + '\n' + traceback.format_exc())
+            msg = 'get detail url error url: ' + response._url + '-'.join(err.args)
+            logging.error(msg=msg)
+
+    def parse_item(self, response):
+        try:
+            item = FTXCommunityItem.handle_response(response)
+            yield item
+        except Exception as err:
+            send_email('fangtianxia lv 2 web parse error', response._url + '\n' + traceback.format_exc())
+            msg = 'lv 2 web parse error url: ' + response._url + '-'.join(err.args)
+            logging.error(msg=msg)
+
+

+ 50 - 0
elabSpider/spiders/lfs_rental_house.py

@@ -0,0 +1,50 @@
+# -*- coding:utf-8 -*-
+
+# @Time    : 2018/4/27 10:50 AM
+
+# @Author  : Swing
+
+import scrapy
+from elabSpider.items import RentalHouseItem
+import logging
+from elabSpider.email_util import send_email
+import traceback
+
+
+class RentalHouseSpider(scrapy.Spider):
+    name = 'lfsrentalHouse'
+    allowed_domains = [
+        'nb.zu.anjuke.com',
+        'nb.anjuke.com'
+    ]
+    start_urls = [
+        'https://nb.anjuke.com/community/props/rent/1003094'
+        # 'https://nb.zu.anjuke.com/rent/F717483045'
+    ]
+
+    def parse(self, response):
+        try:
+            community_list = response.xpath('//ul[@class="m-house-list"]/li/a/@href').extract()
+
+            if community_list:
+                for community_url in community_list:
+                    yield scrapy.Request(community_url, callback=self.parse_item)
+        except Exception as err:
+            send_email('lfsrentalHouse get detail url error', response._url + '\n' + traceback.format_exc())
+            logging.error('get detail url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+        try:
+            next_page = response.xpath(r'//div[@class="m-page"]/div[@class="multi-page"]/a[@class="aNxt"]/@href.....').extract_first()
+            if next_page:
+                yield scrapy.Request(next_page, callback=self.parse)
+        except Exception as err:
+            send_email('lfsrentalHouse get next page error', response._url + '\n' + traceback.format_exc())
+            logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+    def parse_item(self, response):
+        try:
+            item = RentalHouseItem.handle_response(response)
+            yield item
+        except Exception as err:
+            send_email('lfsrentalHouse parse response error', response._url + '\n' + traceback.format_exc())
+            logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))

+ 48 - 0
elabSpider/spiders/lfs_resold.py

@@ -0,0 +1,48 @@
+# -*- coding:utf-8 -*-
+
+# @Time    : 2018/4/27 10:50 AM
+
+# @Author  : Swing
+
+import scrapy
+from elabSpider.items import ResoldHouseItem
+import logging
+import traceback
+from elabSpider.email_util import send_email
+
+
+class ResoldHouseSpider(scrapy.Spider):
+    name = 'lfsresoldHouse'
+    allowed_domains = [
+        'nb.anjuke.com'
+    ]
+    start_urls = [
+        'https://nb.anjuke.com/community/props/sale/1003094/'
+    ]
+
+    def parse(self, response):
+        try:
+            community_list = response.xpath('//ul[@class="m-house-list"]/li/a/@href').extract()
+
+            if community_list:
+                for community_url in community_list:
+                    yield scrapy.Request(community_url, callback=self.parse_item)
+        except Exception as err:
+            send_email('lfsresoldHouse get detail url error', response._url + '\n' + traceback.format_exc())
+            logging.error('get detail url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+        try:
+            next_page = response.xpath(r'//div[@class="m-page"]/div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first()
+            if next_page:
+                yield scrapy.Request(next_page, callback=self.parse)
+        except Exception as err:
+            send_email('lfsresoldHouse get next page url error', response._url + '\n' + traceback.format_exc())
+            logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+    def parse_item(self, response):
+        try:
+            item = ResoldHouseItem.handle_response(response)
+            yield item
+        except Exception as err:
+            send_email('lfsresoldHouse parse response error', response._url + '\n' + traceback.format_exc())
+            logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))

+ 24 - 0
elabSpider/spiders/lfs_sold_average.py

@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import logging
+from elabSpider.items import LfsAveragePriceItem
+import traceback
+from elabSpider.email_util import send_email
+
+
+class LfsSpider(scrapy.Spider):
+    name = 'lfsSoldAverage'
+    allowed_domains = ['nb.anjuke.com']
+    start_urls = ['https://nb.anjuke.com/community/view/1003094']
+    # start_urls = ['https://nb.anjuke.com/community/view/275965?from=Filter_1&hfilter=filterlist']
+
+    def parse(self, response):
+        try:
+            item = LfsAveragePriceItem.handle_response(response)
+            yield item
+        except Exception as err:
+            send_email('lfsSoldAverage parse response error', response._url + '\n' + traceback.format_exc())
+            logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+
+

+ 51 - 0
elabSpider/spiders/nb_ftx_rental_house.py

@@ -0,0 +1,51 @@
+# -*- coding:utf-8 -*-
+
+# @Time    : 2018/4/27 10:50 AM
+
+# @Author  : Swing
+
+import scrapy
+from scrapy.utils.response import get_base_url
+from urllib import parse
+from elabSpider.items import FTXRentalHouseItem
+import logging
+import traceback
+from elabSpider.email_util import send_email
+
+
+class RentalHouseSpider(scrapy.Spider):
+    name = 'ftxrentalHouse'
+    allowed_domains = [
+        'zu.nb.fang.com'
+    ]
+    start_urls = [
+        'http://zu.nb.fang.com/house/h316-n31/'
+    ]
+
+    def parse(self, response):
+        try:
+            community_list = response.xpath('//div[@class="houseList"]/dl/dd[@class="info rel"]/p[@class="title"]/a/@href').extract()
+
+            if community_list:
+                for community_url in community_list:
+                    if community_url.startswith('/chuzu/'):
+                        yield scrapy.Request(parse.urljoin(get_base_url(response), community_url), callback=self.parse_item)
+        except Exception as err:
+            send_email('ftxrentalHouse lv 1 web parse error', response._url + '\n' + traceback.format_exc())
+            logging.error(' error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+        try:
+            next_page = response.xpath(r'//div[@class="fanye"]/a[text()="下一页"]/@href').extract_first()
+            if next_page and next_page.startswith('/house/'):
+                yield scrapy.Request(parse.urljoin(get_base_url(response), next_page), callback=self.parse)
+        except Exception as err:
+            send_email('ftxrentalHouse get next page url error', response._url + '\n' + traceback.format_exc())
+            logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+    def parse_item(self, response):
+        try:
+            item = FTXRentalHouseItem.handle_response(response)
+            yield item
+        except Exception as err:
+            send_email('ftxrentalHouse parse response error', response._url + '\n' + traceback.format_exc())
+            logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))

+ 49 - 0
elabSpider/spiders/nb_rental_house.py

@@ -0,0 +1,49 @@
+# -*- coding:utf-8 -*-
+
+# @Time    : 2018/4/27 10:50 AM
+
+# @Author  : Swing
+
+import scrapy
+from elabSpider.items import RentalHouseItem
+import logging
+import traceback
+from elabSpider.email_util import send_email
+
+
+class RentalHouseSpider(scrapy.Spider):
+    name = 'rentalHouse'
+    allowed_domains = [
+        'nb.zu.anjuke.com'
+    ]
+    start_urls = [
+        'https://nb.zu.anjuke.com/fangyuan/lx8-px3-x1/',
+        'https://nb.zu.anjuke.com/fangyuan/lx1-px3-x1/'
+    ]
+
+    def parse(self, response):
+        try:
+            community_list = response.xpath('//div[@class="maincontent"]/div[@class="list-content"]/div[contains(@class, "zu-itemmod")]/a/@href').extract()
+
+            if community_list:
+                for community_url in community_list:
+                    yield scrapy.Request(community_url, callback=self.parse_item)
+        except Exception as err:
+            send_email('rentalHouse get detail url error', response._url + '\n' + traceback.format_exc())
+            logging.error('get detail url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+        try:
+            next_page = response.xpath(r'//div[@class="page-content"]/div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first()
+            if next_page:
+                yield scrapy.Request(next_page, callback=self.parse)
+        except Exception as err:
+            send_email('rentalHouse get next page url error', response._url + '\n' + traceback.format_exc())
+            logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+    def parse_item(self, response):
+        try:
+            item = RentalHouseItem.handle_response(response)
+            yield item
+        except Exception as err:
+            send_email('rentalHouse parse response error', response._url + '\n' + traceback.format_exc())
+            logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))

+ 50 - 0
elabSpider/spiders/nb_resold.py

@@ -0,0 +1,50 @@
+# -*- coding:utf-8 -*-
+
+# @Time    : 2018/4/27 10:50 AM
+
+# @Author  : Swing
+
+import scrapy
+from elabSpider.items import ResoldHouseItem
+import logging
+import traceback
+from elabSpider.email_util import send_email
+
+
+class ResoldHouseSpider(scrapy.Spider):
+    name = 'nbresoldHouse'
+    allowed_domains = [
+        'nb.anjuke.com'
+    ]
+    start_urls = [
+        'https://nb.anjuke.com/sale/o5-t105/',
+        'https://nb.anjuke.com/sale/o5-t107/'
+        # 'https://nb.anjuke.com/prop/view/A1237992888?from=filter&spread=filtersearch_p&position=117&kwtype=filter&now_time=1526637680'
+    ]
+
+    def parse(self, response):
+        try:
+            community_list = response.xpath('//ul[@id="houselist-mod-new"]/li/div[@class="house-details"]/div[@class="house-title"]/a/@href').extract()
+
+            if community_list:
+                for community_url in community_list:
+                    yield scrapy.Request(community_url, callback=self.parse_item)
+        except Exception as err:
+            send_email('nbresoldHouse get detail url error', response._url + '\n' + traceback.format_exc())
+            logging.error('get detail url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+        try:
+            next_page = response.xpath(r'//div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first()
+            if next_page:
+                yield scrapy.Request(next_page, callback=self.parse)
+        except Exception as err:
+            send_email('nbresoldHouse get next page url error', response._url + '\n' + traceback.format_exc())
+            logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+    def parse_item(self, response):
+        try:
+            item = ResoldHouseItem.handle_response(response)
+            yield item
+        except Exception as err:
+            send_email('nbresoldHouse parse response error', response._url + '\n' + traceback.format_exc())
+            logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))

+ 38 - 0
elabSpider/spiders/resoldapartment.py

@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from elabSpider.items import *
+import traceback
+from elabSpider.email_util import send_email
+
+
+class ExampleSpider(scrapy.Spider):
+    name = '58ershoufang'
+    allowed_domains = ['58.com']
+    start_urls = ['http://nb.58.com/haishu/ershoufang/']
+
+    def parse(self, response):
+        try:
+            for href in response.xpath(r'//ul[@class="house-list-wrap"]/li/div[@class="list-info"]/h2[@class="title"]/a/@href'):
+                url = href.extract()
+                yield scrapy.Request(url, callback=self.parse_item)
+        except:
+            send_email('58ershoufang lv 1 url parse error', response._url + '\n' + traceback.format_exc())
+            print('error')
+
+        try:
+            next_page = response.xpath(r'//div[@class="pager"]/a[@class="next"]/@href').extract_first()
+            if next_page:
+                yield scrapy.Request(next_page, callback=self.parse)
+        except:
+            send_email('58ershoufang get next url error', response._url + '\n' + traceback.format_exc())
+            print('error next page')
+
+    def parse_item(self, response):
+        try:
+            item = ResoldApartmentItem.handle_response(response)
+            yield item
+        except:
+            send_email('58ershoufang get item parse error', response._url + '\n' + traceback.format_exc())
+            print('error ' + response.url)
+
+

+ 102 - 0
elabSpider/spiders/sjk_rental_house.py

@@ -0,0 +1,102 @@
+# -*- coding:utf-8 -*-
+
+# @Time    : 2018/4/27 10:50 AM
+
+# @Author  : Swing
+
+import scrapy
+from elabSpider.items import RentalHouseItem
+import logging
+import traceback
+from elabSpider.email_util import send_email
+
+
+class RentalHouseSpider(scrapy.Spider):
+    name = 'sjkrentalHouse'
+    allowed_domains = [
+        'nb.zu.anjuke.com',
+        'nb.anjuke.com'
+    ]
+    start_urls = [
+        'https://nb.anjuke.com/community/props/rent/275642/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/275642/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/1003094/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/1003094/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/275869/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/275869/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/973807/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/973807/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/973808/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/973808/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/275517/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/275517/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/1000067/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/1000067/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/406899/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/406899/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/1016525/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/1016525/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/275936/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/275936/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/1017728/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/1017728/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/275274/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/275274/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/275658/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/275658/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/275386/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/275386/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/1006982/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/1006982/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/275764/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/275764/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/792725/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/792725/lx8-x1/',
+
+        'https://nb.anjuke.com/community/props/rent/1022250/lx1-x1/',
+        'https://nb.anjuke.com/community/props/rent/1022250/lx8-x1/'
+
+    ]
+
+    def parse(self, response):
+        try:
+            community_list = response.xpath('//ul[@class="m-house-list"]/li/a/@href').extract()
+
+            if community_list:
+                for community_url in community_list:
+                    yield scrapy.Request(community_url, callback=self.parse_item)
+        except Exception as err:
+            send_email('sjkrentalHouse get detail url error', response._url + '\n' + traceback.format_exc())
+            logging.error('get detail url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+        try:
+            next_page = response.xpath(r'//div[@class="m-page"]/div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first()
+            if next_page:
+                yield scrapy.Request(next_page, callback=self.parse)
+        except Exception as err:
+            send_email('sjkrentalHouse get next page url parse error', response._url + '\n' + traceback.format_exc())
+            logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+    def parse_item(self, response):
+        try:
+            item = RentalHouseItem.handle_response(response)
+            yield item
+        except Exception as err:
+            send_email('sjkrentalHouse parse response error', response._url + '\n' + traceback.format_exc())
+            logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))

+ 66 - 0
elabSpider/spiders/sjk_resold.py

@@ -0,0 +1,66 @@
+# -*- coding:utf-8 -*-
+
+# @Time    : 2018/4/27 10:50 AM
+
+# @Author  : Swing
+
+import scrapy
+from elabSpider.items import ResoldHouseItem
+import logging
+import traceback
+from elabSpider.email_util import send_email
+
+
+class ResoldHouseSpider(scrapy.Spider):
+    name = 'sjkresoldHouse'
+    allowed_domains = [
+        'nb.anjuke.com'
+    ]
+    start_urls = [
+        'https://nb.anjuke.com/community/props/sale/275642/',
+        'https://nb.anjuke.com/community/props/sale/1003094/',
+        'https://nb.anjuke.com/community/props/sale/275869/',
+        'https://nb.anjuke.com/community/props/sale/973807/',
+        'https://nb.anjuke.com/community/props/sale/973808/',
+        'https://nb.anjuke.com/community/props/sale/275517/',
+        'https://nb.anjuke.com/community/props/sale/1000067/',
+        'https://nb.anjuke.com/community/props/sale/406899/',
+        'https://nb.anjuke.com/community/props/sale/1016525/',
+        'https://nb.anjuke.com/community/props/sale/275936/',
+        'https://nb.anjuke.com/community/props/sale/1017728/',
+        'https://nb.anjuke.com/community/props/sale/275274/',
+        'https://nb.anjuke.com/community/props/sale/275658/',
+        'https://nb.anjuke.com/community/props/sale/275386/',
+        'https://nb.anjuke.com/community/props/sale/1006982/',
+        'https://nb.anjuke.com/community/props/sale/275764/',
+        'https://nb.anjuke.com/community/props/sale/792725/',
+        'https://nb.anjuke.com/community/props/sale/1022250/'
+        # 'https://nb.anjuke.com/prop/view/A1237992888?from=filter&spread=filtersearch_p&position=117&kwtype=filter&now_time=1526637680'
+    ]
+
+    def parse(self, response):
+        try:
+            community_list = response.xpath('//ul[@class="m-house-list"]/li/a/@href').extract()
+
+            if community_list:
+                for community_url in community_list:
+                    yield scrapy.Request(community_url, callback=self.parse_item)
+        except Exception as err:
+            send_email('sjkresoldHouse get detail url error', response._url + '\n' + traceback.format_exc())
+            logging.error('get detail url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+        try:
+            next_page = response.xpath(r'//div[@class="m-page"]/div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first()
+            if next_page:
+                yield scrapy.Request(next_page, callback=self.parse)
+        except Exception as err:
+            send_email('sjkresoldHouse get next page url error', response._url + '\n' + traceback.format_exc())
+            logging.error('get next page url error ! url: ' + response._url + " reason: " + '-'.join(err.args))
+
+    def parse_item(self, response):
+        try:
+            item = ResoldHouseItem.handle_response(response)
+            yield item
+        except Exception as err:
+            send_email('sjkresoldHouse parse response error', response._url + '\n' + traceback.format_exc())
+            logging.error('parse response error ! url: ' + response._url + " reason: " + '-'.join(err.args))

+ 12 - 0
elabSpider/spiders/test_db.py

@@ -0,0 +1,12 @@
+import pymongo
+from elabSpider.items import ResoldApartmentItem
+
+uri = "mongodb://logdb:logdb@dds-uf6da0fedc9881d41450-pub.mongodb.rds.aliyuncs.com:3717,dds-uf6da0fedc9881d42459-pub.mongodb.rds.aliyuncs.com:3717/logdb?replicaSet=mgset-12835903"
+client = pymongo.MongoClient(uri, authSource='logdb')
+
+db = client['logdb']
+coll = db['test_coll']
+
+item = ResoldApartmentItem()
+item['title'] = 'this is a test title'
+coll.insert_one(item)

+ 16 - 0
elabSpider/start.py

@@ -0,0 +1,16 @@
+# -*- coding:utf-8 -*-
+
+# @Time    : 2018/4/23 2:58 PM
+
+# @Author  : Swing
+
+
+from scrapy import cmdline
+
+# cmdline.execute("scrapy crawl ftxrentalHouse".split())
+# cmdline.execute("scrapy crawl rentalHouse".split())
+cmdline.execute("scrapy crawl nbresoldHouse".split())
+# cmdline.execute("scrapy crawl lfsrentalHouse".split())
+# cmdline.execute("scrapy crawl lfsSoldAverage".split())
+# cmdline.execute("scrapy crawl test".split())
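cmdline.execute() hands control to the Scrapy CLI and only ever runs the one uncommented spider. If several of the project's spiders need to run from a single script, a CrawlerProcess is the usual alternative; a minimal sketch using spider names registered in this project:

# Sketch only: running more than one of the project's spiders from one script.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('nbresoldHouse')
process.crawl('lfsSoldAverage')
process.start()  # blocks until both crawls have finished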
+

+ 11 - 0
scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = elabSpider.settings
+
+[deploy]
+url = http://0.0.0.0:8080/
+project = elabSpider
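The [deploy] section assumes a scrapyd instance listening at the URL above. Once the project has been deployed there (for example with scrapyd-deploy), a crawl can be scheduled over HTTP; a minimal sketch, assuming the service is actually reachable at that address:

# Sketch only: scheduling the nbresoldHouse spider on the scrapyd instance
# configured in the [deploy] section above.
import requests

resp = requests.post('http://0.0.0.0:8080/schedule.json',
                     data={'project': 'elabSpider', 'spider': 'nbresoldHouse'})
print(resp.json())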