The code would be much shorter (a single regex), but I wrote separate functions to handle sites with "protection" against automatic grabbing.
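For comparison, that single-regex version could look roughly like this (a minimal sketch: it reuses the list.txt/proxy.txt names from the full script below and only catches proxies printed as plain ip:port text):

#!/usr/bin/env python
# Minimal sketch: pull plain ip:port pairs from every source in list.txt.
import re, urllib2

for url in open('list.txt'):
    html = urllib2.urlopen(url.strip()).read()
    found = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html)
    if found:
        open('proxy.txt', 'a').write('\n'.join(found) + '\n')
    print url.strip() + ' parsed ' + str(len(found)) + ' proxy'

Sites that obfuscate the port with javascript slip through such a sketch, which is exactly what the per-site functions below handle. The full script: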
#!/usr/bin/env python
import re, urllib2, ConfigParser, threading, time
from Queue import Queue, Empty

# Source URLs, one per line.
f = open('list.txt', 'r')
s_list = f.readlines()
f.close()

config = ConfigParser.RawConfigParser()
config.read('config.cfg')
cookie = config.getint('Cookie', 'cookie')

queue = Queue()
LOCK = threading.RLock()  # serializes writes to proxy.txt across threads
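# Expected config.cfg layout (an assumption based on the keys read above;
# the values are examples):
#   [Cookie]
#   cookie = 1
#   [Threads]
#   threads = 10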

def main():
    print 'Started'
    threads = config.getint('Threads', 'threads')
    # Fill the task queue with source URLs, stripping trailing newlines.
    for line in s_list:
        queue.put(line.strip())
    for _ in xrange(threads):
        thread_ = threading.Thread(target=worker)
        thread_.start()
    # Wait until every worker thread has exited.
    while threading.activeCount() > 1:
        time.sleep(1)
    print "Finished"

def worker():
    while True:
        try:
            target_link = queue.get_nowait()
        except Empty:
            return
        # pars() returns "ERROR" on failure; put the link back for a retry.
        if pars(target_link) == "ERROR":
            queue.put(target_link)

def hidemyass(url):
    sock = urllib2.urlopen(url)
    data = sock.read()
    sock.close()
    many = 0
    # IPs and ports sit in separate table cells; collect them in parallel lists.
    ips = re.findall('(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})', data, re.DOTALL)
    ports = re.findall('\n(?:[\d]{1,6})</td>', data, re.DOTALL)
    for n in range(len(ips)):
        good = ips[n] + ':' + ports[n][1:].replace('</td>', '')
        with LOCK:
            open("proxy.txt", "a+").write(good + '\n')
        many += 1
    print url + ' parsed ' + str(many) + ' proxy'

def nntime(url):
    sock = urllib2.urlopen(url)
    data = sock.read()
    sock.close()
    ip = re.findall('<td>((?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3}))<script', data, re.DOTALL)
    port = re.findall('<script type="text/javascript">document.write\(":"(.+?)\)</script></td>', data, re.DOTALL)
    # The port is obfuscated: the page assigns one letter per digit
    # (e.g. a=0, b=1, ...) and rebuilds the port via document.write().
    # Recover the letter-to-digit mapping from the inline script.
    letters = {}
    for d in range(10):
        letters[re.findall('([a-z])=%d' % d, data, re.DOTALL)[0]] = str(d)
    for q in range(len(port)):
        good = port[q].replace('+', '')
        for letter, digit in letters.items():
            good = good.replace(letter, digit)
        with LOCK:
            open("proxy.txt", "a+").write(ip[q] + ':' + good + '\n')
    print url + ' parsed ' + str(len(ip)) + ' proxy'

def topgen(url):
    sock = urllib2.urlopen(url)
    data = sock.read()
    sock.close()
    res = re.findall('type=\"text/javascript\">document.write((.+?))</script>', data, re.DOTALL)
    # Each proxy is printed via document.write() with the digits hidden behind
    # javascript variables declared in a fixed order: 6,3,7,9,2,8,1,4,0,5.
    # Each pattern captures a variable name between two adjacent assignments.
    pairs = [
        ('<script type="text/javascript">\n(.+?)="6";', '6'),
        ('="6";\n(.+?)="3";', '3'),
        ('="3";\n(.+?)="7";', '7'),
        ('="7";\n(.+?)="9";', '9'),
        ('="9";\n(.+?)="2";', '2'),
        ('="2";\n(.+?)="8";', '8'),
        ('="8";\n(.+?)="1";', '1'),
        ('="1";\n(.+?)="4";', '4'),
        ('="4";\n(.+?)="0";', '0'),
        ('="0";\n(.+?)="5";', '5'),
    ]
    reg = str(res)[2:-2]
    reg = reg.replace('+', '').replace('\"', '')
    # Substitute every variable name with its digit.
    for pattern, digit in pairs:
        name = str(re.findall(pattern, data, re.DOTALL))[2:-2]
        reg = reg.replace(name, digit)
    # Strip the remaining javascript punctuation, one proxy per line.
    reg = reg.replace(')\'),', '\n').replace(')\'', '\n').replace(' \'(', '').replace('(', '').replace(',', '').replace('\'', '').replace(' ', '')
    with LOCK:
        open("proxy.txt", "a+").write(reg[1:-1] + '\n')
    found = len(reg.split('\n')) - 1
    print url + ' parsed ' + str(found) + ' proxy'

def proxyforest(url):
    sock = urllib2.urlopen(url)
    data = sock.read()
    sock.close()
    # Each entry looks like N,'a','b','c','d',port where N in 1..4 says by
    # how much the four IP octets were rotated out of their natural order.
    res = re.findall('([1-4],\'.*?\',\'.*?\',\'.*?\',\'.*?\',[0-9]{1,4})', data, re.DOTALL)
    for entry in res:
        repars = entry.replace("'", "").split(',')
        # Rotate the octets back into place: N=1 keeps the order as-is,
        # N=4 means the first octet was moved to the end, and so on.
        offset = (5 - int(repars[0])) % 4
        octets = repars[1:5]
        good = '.'.join(octets[offset:] + octets[:offset]) + ':' + repars[5]
        with LOCK:
            open("proxy.txt", "a+").write(good + '\n')
    print url + ' parsed ' + str(len(res)) + ' proxy'

def xroxy(url):
    sock = urllib2.urlopen(url)
    data = sock.read()
    sock.close()
    many = 0
    # Host and port are both embedded in the proxy: link of each row.
    ip = re.findall('<td><a href=\'proxy:name=XROXY proxy&host=(.+?)&port=', data, re.DOTALL)
    port = re.findall('&port=(.+?)&notes=', data, re.DOTALL)
    for i in range(len(ip)):
        with LOCK:
            open("proxy.txt", "a+").write(ip[i] + ':' + port[i] + '\n')
        many += 1
    print url + ' parsed ' + str(many) + ' proxy'

def normal(url):
    # Fallback for pages that list proxies as plain ip:port text.
    sock = urllib2.urlopen(url)
    source = sock.read()
    sock.close()
    proxy = re.findall('(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\:(?:[\d]{1,4})', source, re.DOTALL)
    for x in range(len(proxy)):
        with LOCK:
            open("proxy.txt", "a+").write(proxy[x] + '\n')
    print url + ' parsed ' + str(len(proxy)) + ' proxy'

# Some sources need cookies; install a cookie-aware opener when requested.
if cookie == 1:
    urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor))

def pars(page):
    # Dispatch a single source URL to the matching site-specific parser.
    try:
        if page.find('topgen') > 0:
            topgen(page)
        elif page.find('proxyforest.com') > 0:
            proxyforest(page)
        elif page.find('xroxy.com') > 0:
            xroxy(page)
        elif page.find('hidemyass.com') > 0:
            hidemyass(page)
        elif page.find('nntime') > 0:
            nntime(page)
        else:
            normal(page)
    except Exception:
        return "ERROR"

if __name__ == "__main__":
    main()
    raw_input()
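
To run it, list.txt should hold the source pages, one URL per line, and config.cfg must define the [Cookie] and [Threads] sections shown above; collected proxies are appended to proxy.txt. The dispatcher only checks for the substrings 'topgen', 'proxyforest.com', 'xroxy.com', 'hidemyass.com' and 'nntime', so the exact paths in this example list.txt are hypothetical:

http://www.xroxy.com/proxylist.htm
http://nntime.com/
http://some-other-site.example/proxies.html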