proxies.py

# -*- coding:utf-8 -*-
# @Time : 2018/4/19 3:47 PM
# @Author : Swing
from bs4 import BeautifulSoup
import lxml
from multiprocessing import Process, Queue
from queue import Empty
import random
import json
import time
import requests


class Proxies(object):
    def __init__(self, page=3):
        self.proxies = []
        self.verify_pro = []
        self.page = page
        self.headers = {
            'Accept': '*/*',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
        self.get_proxies()
        self.get_proxies_nn()

    def get_proxies(self):
        # Scrape self.page consecutive pages of the site's "nt" proxy list,
        # starting from a random page.
        page = random.randint(1, 10)
        page_stop = page + self.page
        while page < page_stop:
            url = 'http://www.xicidaili.com/nt/%d' % page
            html = requests.get(url, headers=self.headers).content
            soup = BeautifulSoup(html, 'lxml')
            ip_list = soup.find(id='ip_list')
            for odd in ip_list.find_all(class_='odd'):
                # Column 5 is the protocol (HTTP/HTTPS); columns 1-2 are IP and port.
                protocol = odd.find_all('td')[5].get_text().lower() + '://'
                self.proxies.append(protocol + ':'.join([x.get_text() for x in odd.find_all('td')[1:3]]))
            page += 1

    def get_proxies_nn(self):
        # Same as get_proxies(), but for the "nn" (high-anonymity) proxy list.
        page = random.randint(1, 10)
        page_stop = page + self.page
        while page < page_stop:
            url = 'http://www.xicidaili.com/nn/%d' % page
            html = requests.get(url, headers=self.headers).content
            soup = BeautifulSoup(html, 'lxml')
            ip_list = soup.find(id='ip_list')
            for odd in ip_list.find_all(class_='odd'):
                protocol = odd.find_all('td')[5].get_text().lower() + '://'
                self.proxies.append(protocol + ':'.join([x.get_text() for x in odd.find_all('td')[1:3]]))
            page += 1

    def verify_proxies(self):
        # Queue of proxies that have not been verified yet
        old_queue = Queue()
        # Queue of proxies that passed verification
        new_queue = Queue()
        print('verify proxy......')
        works = []
        for i in range(15):
            works.append(Process(target=self.verify_one_proxy, args=(old_queue, new_queue)))
        for work in works:
            work.start()
        for proxy in self.proxies:
            old_queue.put(proxy)
        # One sentinel per worker so every process exits its loop.
        for work in works:
            old_queue.put(0)
        for work in works:
            work.join()
        # Keep only the proxies that passed verification.
        self.proxies = []
        while True:
            try:
                self.proxies.append(new_queue.get(timeout=1))
            except Empty:
                break
        print('verify_proxies done!')

    def verify_one_proxy(self, old_queue, new_queue):
        while True:
            proxy = old_queue.get()
            if proxy == 0:
                # Sentinel value: no more proxies to verify.
                break
            protocol = 'https' if 'https' in proxy else 'http'
            proxies = {protocol: proxy}
            try:
                if requests.get('http://www.baidu.com', proxies=proxies, timeout=2).status_code == 200:
                    print('success %s' % proxy)
                    new_queue.put(proxy)
            except requests.RequestException:
                print('fail %s' % proxy)


if __name__ == '__main__':
    a = Proxies()
    a.verify_proxies()
    print(a.proxies)
    with open('proxies.txt', 'a') as f:
        for proxy in a.proxies:
            f.write(proxy + '\n')
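
# --- Illustrative usage, not part of the original script ---
# A minimal sketch of consuming proxies.txt, assuming one "scheme://host:port"
# entry per line (the format written above); the target URL is arbitrary:
#
#     import random
#     import requests
#
#     with open('proxies.txt') as f:
#         pool = [line.strip() for line in f if line.strip()]
#
#     proxy = random.choice(pool)
#     scheme = 'https' if proxy.startswith('https') else 'http'
#     try:
#         resp = requests.get('http://httpbin.org/ip', proxies={scheme: proxy}, timeout=5)
#         print(resp.status_code, resp.text)
#     except requests.RequestException as exc:
#         print('proxy failed:', exc)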