Форум сайта python.su
вот мой
# -*- coding: UTF-8 -*-
import sys, os, urllib2, subprocess, xlrd, YOUR_DB_DRIVER, logging as log
conn = YOUR_DB_DRIVER.connect(...)
cur = conn.cursor()
URLs = '''http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%90%D0%BA%D0%BC%D0%BE%D0%BB%20%D0%BE%D0%B1%D0%BB/11.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%90%D0%BA%D1%82%D1%8E%D0%B1%D0%B8%D0%BD/15.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%90%D0%BB%D0%BC%D0%B0%D1%82/19.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%90%D1%82%D1%8B%D1%80%D0%B0%D1%83/23.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%97%D0%9A%D0%9E/27.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%96%D0%B0%D0%BC%D0%B1%D1%8B%D0%BB/31.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%9A%D0%B0%D1%80%D0%B0%D0%B3%D0%B0%D0%BD%D0%B4%D0%B0/35.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%9A%D0%BE%D1%81%D1%82%D0%B0%D0%BD%D0%B0%D0%B9/39.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%9A%D1%8B%D0%B7%D1%8B%D0%BB%D0%BE%D1%80%D0%B4%D0%B0/43.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%9C%D0%B0%D0%BD%D0%B3%D0%B8%D1%81%D1%82%D0%B0%D1%83/47.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%AE%D0%9A%D0%9E/51.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%9F%D0%B0%D0%B2%D0%BB%D0%BE%D0%B4%D0%B0%D1%80/55.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%A1%D0%9A%D0%9E/59.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%92%D0%9A%D0%9E/63.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%B3.%20%D0%90%D1%81%D1%82%D0%B0%D0%BD%D0%B0/71.rar
http://www.stat.kz/digital/bizness_registr/Documents/%D1%81%D1%82%D0%B0%D1%82%20%D1%80%D0%B5%D0%B3%D0%B8%D1%81%D1%82%D1%80/%D0%B3.%20%D0%90%D0%BB%D0%BC%D0%B0%D1%82%D1%8B/75.rar'''
def dload(url):
'''Downloads an object'''
log.info('dloading %s ...' % url)
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Opera/9.50 (Linux x86_64; U; en)')]
return opener.open(url).read()
def launchWithoutConsole(command, args):
"""Launches 'command' windowless and waits until finished"""
startupinfo = subprocess.STARTUPINFO()
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
return subprocess.Popen([command] + args, startupinfo=startupinfo).wait()
for url in URLs.split('\n'):
rarfile=url.split('/')[-1]
fout = open('./LEGAL_PERSON' + os.sep + rarfile, "wb")
fout.write(dload(url))
fout.close()
try:
launchWithoutConsole('./LEGAL_PERSON' + os.sep + 'UnRAR.exe',
['x', '-o+', './LEGAL_PERSON' + os.sep + rarfile, './LEGAL_PERSON'] )
except (WindowsError, os.error), e:
log.exception(e)
log.error("UnRAR failed for %s" % rarfile)
sys.exit(787)
for book_file in (file for file in os.listdir('./LEGAL_PERSON') if file.endswith('.xls')):
log.info('processing %s ...' % book_file)
book = xlrd.open_workbook(u"./LEGAL_PERSON/" + book_file, on_demand=True )
for sh in book.sheets():
for row in xrange(3, sh.nrows): #
try:
out = {}
out['L_PERS_BIN']=unicode(sh.cell(row, 0).value) # row/col
out['L_PERS_OKPO']=unicode(sh.cell(row, 1).value)
out['L_PERS_NAME']=unicode(sh.cell(row, 2).value)
out['L_PERS_CITY']=unicode(sh.cell(row, 6).value)
sql = """Insert into Nsitemp.LEGAL_PERSON_ALL
(%s)
values (%s)""" % ( ','.join(out.keys()),
','.join('?'*len(out))
)
cur.execute(sql, [(value.strip().replace(' ',' ')
if isinstance(value,unicode)
else value)
for value in out.values()] )
except Exception, e:
log.exception(e)
sys.exit(777)
if row%50==0: conn.commit()
conn.commit()
Отредактировано (Окт. 28, 2010 12:31:37)
Офлайн
Я дико извиняюсь, dvs а где импорты?
Офлайн
helm2004
мыши погрызли
Офлайн
dvsВот такое получилось.
Давай выложим свои варианты – может кому польза будет.
# -*- coding: utf-8 -*-
import urllib
import re
import os
import commands
import sys
import xlrd
import sqlite3
WORK_URL = 'http://www.stat.kz/digital/bizness_registr/Pages/stat_reg.aspx'
RAR_SAVE_PATH = '/home/pill/py/rars'
XLS_UNPACK_PATH = '/home/pill/py/xls'
DB_PATH = '/home/pill/py/sql3_db'
def get_archives(path):
"downloads archives from work_url to specified location"
if not os.path.exists(path):
if raw_input("%s path dosen't exist, do you want to create it? (y/n) "%path) == 'y':
os.mkdir(path)
else:
sys.exit(0)
rars_list = []
try:
sock = urllib.urlopen(WORK_URL)
data = sock.read()
sock.close()
except IOError:
print "Couldn't connect to %s" %work_url
return None
pass
rars_list = [''.join(('http://www.stat.kz',x)) for x in re.findall(r'href="(.+?rar)"',data)]
filenames = []
print 'retriaving: ',
for url in rars_list:
filename = os.path.join(path,re.search(r'(\d+\.rar)',url).group(0))
filenames.append(filename)
try:
urllib.urlretrieve(url, filename)
except IOError:
print "Couldn't reatrive or save file %s" %filename
return None
print '.',
return path
def unpack_archives(in_path, out_path):
"unpack archives to the same location"
if not os.path.exists(out_path):
if raw_input("%s path dosen't exist, do you want to create it? (y/n) "%out_path) == 'y':
os.mkdir(out_path)
else:
sys.exit(0)
print 'Extracting: .',
filenames = [os.path.join(in_path, name) for name in os.listdir(in_path) if name.endswith('rar')]
for filename in filenames:
cmd = ' '.join(('rar e',filename, out_path))
(status, output) = commands.getstatusoutput(cmd)
if status: ## Error case, print the command's output to stderr and exit
sys.stderr.write(output)
return None
print '.',
return [os.path.join(out_path, re.search(r'\d+\.rar',x).group()) for x in filenames]
def read_and_save(in_path, db_path):
"reads xls data from files and saves them to database"
filenames = [os.path.join(in_path, name) for name in os.listdir(in_path) if name.endswith('xls')]
db_conn = sqlite3.connect(db_path)
db_cursor = db_conn.cursor()
def process_xls(file_path):
'Process one xls to db'
rb = xlrd.open_workbook(file_path, formatting_info=True)
sheet = rb.sheet_by_index(0)
table_name = sheet.row_values(0)[0].replace('\n','')
t = [table_name,]
t.extend(sheet.row_values(2))
t = [''.join(("'",item.replace('\n',''),"'")) for item in t]
t = tuple(t)
print 'xls: ',file_path
sql = "create table %s (%s text, %s text, %s text, %s text, %s text, %s text, %s text, %s text, %s text)"%t
try:
db_cursor.execute(sql)
db_conn.commit()
print 'Table created'
except:
print 'Table already exists'
i = 0
print '%i entrys to insert'%sheet.nrows
for rown in xrange(3, sheet.nrows):
values = [table_name,]
values.extend(sheet.row_values(rown))
values = [''.join(("'",item.replace('\n','').replace("'",'"'),"'")) for item in values]
values = tuple(values)
sql = "insert into %s values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"%values
db_cursor.execute(sql)
if i%100 == 0:
db_conn.commit()
if i%1000 == 0:
print '.',
i += 1
db_conn.commit()
print '%d entrys has been inserted succefuly'%i
for filename in filenames:
process_xls(filename)
db_conn.close()
def worker():
print 'Start'
get_archives(RAR_SAVE_PATH)
unpack_archives(RAR_SAVE_PATH, XLS_UNPACK_PATH)
read_and_save(XLS_UNPACK_PATH, DB_PATH)
print 'Done'
def main():
worker()
if __name__ == '__main__':
main()
Отредактировано (Окт. 28, 2010 19:55:12)
Офлайн
pill, привет,
pill
Зы: я жадный до экспы, потому если кто-то поругает (Стиль, код , структура все в копилку), буду очень признателен.
def main():
worker()
if __name__ == '__main__':
main()
if __name__ == '__main__':
worker()
pillЭто зависит от драйвера БД, к сожалению. Я использовал DB2 и соответственно, его DB-API2-драйвер.
Так и не разобрался почему вариант:
sql = “insert into ? values (?,?,?,?,?,?,?,?,?)”
db_cursor.execute(sql,values)
ругается на Syntax Error
pillUNIX only :(
import commands
Отредактировано (Окт. 29, 2010 11:17:18)
Офлайн
dvsДа, нужно избавляться от привычек
Си-шное наследие?
dvsВ документации sqlite3 подстановка описана.
Это зависит от драйвера БД, к сожалению. Я использовал DB2 и соответственно, его DB-API2-драйвер.
dvsда о мультиплатформенности не подумал…pillUNIX only :(
import commands
Офлайн