Есть исходный скрипт: он разбирает sitemap.xml, заходит по каждой ссылке из карты сайта и извлекает оттуда ключевые слова (meta keywords) с помощью BeautifulSoup:
# -*- coding: cp1251 -*-
import urllib, urlparse, re
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup
# Build the sitemap URL and download it up front.
url_start = "http://site.ru"
url_end = "sitemap.xml"
url = urlparse.urljoin(url_start, url_end)
print u"Ждите ..."
# NOTE: network I/O happens at module import time.
link = urllib.urlopen(url)
# BeautifulStoneSoup is the XML-mode parser of BeautifulSoup 3.
sup = BeautifulStoneSoup(link)
# Flatten the parsed sitemap back to text so the <loc> regex below can scan it.
supik = str(sup)
# Shared progress counter, advanced by starting().
k = 1
# Extracts every URL between <loc>...</loc> tags (dot matches newlines).
p = re.compile(r"<loc>(.*?)</loc>", re.S | re.I)
def starting(gg):
global k
link1 = urllib.urlopen(gg)
soup = BeautifulSoup(link1, fromEncoding="utf-8")
a = soup(attrs={"name": "Keywords"})[0]["content"]
f1 = open("file1.txt", "a")
f1.write(a.encode("cp1251") + "\n")
f1.close()
link1.close()
print u"Спарсили ключевик № " + str(k)
k += 1
q = raw_input("Parse ? y/n: ")
if q == "y":
f = open("file1.txt", "w")
for i in p.findall(supik):
starting(i)
f.close()
link.close()
else:
print "Okay ..."
raw_input()
# -*- coding: cp1251 -*-
import urllib, urlparse, re, Queue, threading, time
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup
# Shared state for the threaded variant of the crawler.
queue = Queue.Queue()  # work queue of page URLs taken from the sitemap
lock = threading.RLock()  # serializes file writes / progress output
threads_count = 100  # number of parallel fetcher threads
url_start = "http://site.ru"
url_end = "sitemap.xml"
url = urlparse.urljoin(url_start, url_end)
print u"Ждите ..."
# NOTE: network I/O happens at module import time.
link = urllib.urlopen(url)
sup = BeautifulStoneSoup(link)
# Flatten the parsed sitemap back to text so the <loc> regex can scan it.
supik = str(sup)
# Shared progress counter, advanced by write().
k = 1
# Extracts every URL between <loc>...</loc> tags (dot matches newlines).
p = re.compile(r"<loc>(.*?)</loc>", re.S | re.I)
def worker():
    """Thread body: pull URLs off the shared queue until it is drained.

    Returns (ending the thread) as soon as the queue is empty; each URL
    is handed to starting(), which does the fetch/parse/report work.
    """
    # `queue` is only read here, so the original `global queue` was needless.
    while True:
        try:
            target = queue.get_nowait()
        except Queue.Empty:
            return  # queue drained -- let this thread finish
        starting(target)
def write(data, s):
global k
lock.acquire()
if s == True:
f1 = open("file1.txt", "a")
f1.write(data + "\n")
f1.close()
elif s != True:
pass
lock.release()
print u"Спарсили ключевик № " + str(k)
k += 1
def starting(gg):
    """Fetch page *gg* and report its meta-keywords through write().

    On success the keywords are cp1251-encoded and appended to the output
    file; on any failure the page is only counted and nothing is written.
    """
    link1 = None
    try:
        link1 = urllib.urlopen(gg)
        soup = BeautifulSoup(link1, fromEncoding="utf-8")
        a = soup(attrs={"name": "Keywords"})[0]["content"]
        write(a.encode("cp1251"), True)
    except Exception:
        # BUG FIX: the original called write(b, False) here -- when
        # urlopen()/parsing failed before `b` was bound, that raised
        # NameError inside the handler and silently killed the worker
        # thread.  write() ignores its first argument when the flag is
        # falsy, so pass None.
        write(None, False)
    finally:
        # BUG FIX: the original never closed the connection on failure.
        if link1 is not None:
            link1.close()
def main():
    """Seed the queue from the sitemap, run the worker pool, wait for it.

    Side effects: truncates file1.txt, starts threads_count worker threads,
    and closes the module-level sitemap connection when all work is done.
    """
    # BUG FIX: the original held this "w" handle open for the entire run
    # while workers appended through their own handles; truncate and close.
    open("file1.txt", "w").close()
    for loc in p.findall(supik):
        queue.put(loc)
    threads = []
    for _ in xrange(threads_count):
        t = threading.Thread(target=worker)
        t.start()
        threads.append(t)
    # Join the threads we actually started instead of busy-polling
    # threading.active_count(), which sleeps in a loop and also counts
    # unrelated threads.
    for t in threads:
        t.join()
    link.close()
q = raw_input("Parse ? y/n: ")
if q == "y":
main()
else:
print "Okay ..."
raw_input()