欢迎光临
我们一直在努力

获取和验证HTTP代理的脚本

# -*- coding: utf-8 -*-
import httplib
import time
import urllib
import threading
import urllib2
import urllib2 as url
import random
import httplib
import urllib
from bs4 import BeautifulSoup
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool

# Pool of browser User-Agent strings; get_proxy() picks one at random for each
# request. Long entries are split with adjacent-literal concatenation instead
# of a backslash inside the literal, because a backslash-newline inside quotes
# keeps the next line's indentation as part of the header value.
user_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    ('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ '
     '(KHTML, like Gecko) Element Browser 5.0'),
    'IBM WebExplorer /v0.94',
    'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    ('Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 '
     '(KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25'),
    ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
     'Chrome/28.0.1468.0 Safari/537.36'),
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
]

def get_proxy():
    """Scrape proxy listings and save the HTTP entries to ``proxy.txt``.

    Requests listing pages 1 through 199 under
    ``http://www.xici.net.co/nn/``, each with a randomly chosen User-Agent,
    parses the table whose id is ``ip_list`` and writes every row whose
    protocol column equals ``HTTP`` as ``HTTP=ip:port`` (one per line),
    echoing the same text to stdout.

    Errors raised by ``urllib2.urlopen`` propagate to the caller; the output
    file is closed either way.
    """
    with open('proxy.txt', 'w') as pf:
        for page in range(1, 200):
            # Named page_url so the module-level alias ``url`` (urllib2) is
            # not shadowed inside this function.
            page_url = 'http://www.xici.net.co/nn/' + str(page)
            request = urllib2.Request(page_url)
            request.add_header('User-agent', random.choice(user_agents))
            response = urllib2.urlopen(request)
            try:
                html = response.read()
            finally:
                response.close()

            soup = BeautifulSoup(html, 'lxml')
            table = soup.find('table', id='ip_list')
            if table is None:
                # The page has no listing table (e.g. an error page);
                # skip it instead of failing on None.find_all.
                continue

            # The first row is skipped, as in the original (table header).
            for tr in table.find_all('tr')[1:]:
                tds = tr.find_all('td')
                if len(tds) < 6:
                    # Not a full data row; columns 1, 2 and 5 are required.
                    continue
                ip = tds[1].text.strip()
                port = tds[2].text.strip()
                protocol = tds[5].text.strip()
                if protocol == 'HTTP':
                    entry = '%s=%s:%s' % (protocol, ip, port)
                    pf.write(entry + '\n')
                    print(entry)

def var_proxy():
    """Verify the proxies in ``proxy.txt``; save working ones to ``available.txt``.

    Each input line is expected to look like ``PROTOCOL=ip:port``. Every
    proxy is asked (from a pool of 20 threads) to fetch
    ``http://1212.ip138.com/ic.asp``; a proxy counts as working when its own
    IP address appears in the response body. Working proxies are printed as
    they are found and written, in input order, one ``ip:port`` per line,
    once all checks have finished.
    """
    with open('proxy.txt', 'r') as in_file:
        lines = in_file.readlines()

    def check(line):
        """Return ``ip:port`` when the proxy on *line* works, else ``None``."""
        try:
            protocol, proxy = line.strip().split('=')
            ip, port = proxy.split(':')
        except ValueError:
            # Blank or malformed line: skip it rather than letting the
            # exception abort the whole pool.map call.
            return None

        headers = {'Content-Type': 'application/x-www-form-urlencoded',
                   'Cookie': '',
                   'Referer': 'http://www.baidu.com'}
        conn = None
        try:
            conn = httplib.HTTPConnection(proxy, timeout=3.0)
            # An absolute URL in the request line makes the peer act as a
            # forwarding proxy for this request.
            conn.request(method='GET', url='http://1212.ip138.com/ic.asp',
                         headers=headers)
            html_doc = conn.getresponse().read()
        except Exception as e:
            # Best effort: a dead or misbehaving proxy is reported and
            # skipped, never fatal for the run.
            print(e)
            return None
        finally:
            if conn is not None:
                conn.close()

        # Membership test instead of ``find(ip) > 0``, which would miss a
        # match at offset 0.
        if ip in html_doc:
            print(proxy)
            return proxy
        print("error!")
        return None

    pool = ThreadPool(20)
    try:
        results = pool.map(check, lines)
    finally:
        pool.close()
        pool.join()

    # Written from a single thread after the pool is done, so worker threads
    # never share the output file object.
    with open('available.txt', 'w') as out_file:
        for proxy in results:
            if proxy is not None:
                out_file.write('%s\n' % proxy)

if __name__ == '__main__':
    # Minimal text menu: "1" scrapes a fresh proxy list into proxy.txt,
    # "2" verifies the proxies already saved there.
    print("getproxy 1")
    print("varproxy 2")
    # Surrounding whitespace is ignored so that "1 " still selects option 1.
    flag = raw_input().strip()
    if flag == "1":
        get_proxy()
    elif flag == "2":
        var_proxy()
    else:
        print("input error!")

未经允许不得转载:Caldow » 获取和验证HTTP代理的脚本
分享到: 生成海报

切换注册

登录

忘记密码 ?

切换登录

注册

我们将发送一封验证邮件至你的邮箱, 请正确填写以完成账号注册和激活