Python-сообщество

Casufi · Окт. 8, 2007 17:10:21

Добрый день.

Начал изучать Питон и для старта выбрал задачку - перекодировать все html файлы из каталога в кодировку, указанную в теге meta.
В результате получился вот такой скриптик.

#!/usr/local/bin/python

import os
import re
import chardet
import codecs
import types
from chardet.universaldetector import UniversalDetector

# Walk a directory tree and, for every .html/.htm file, detect its current
# encoding with chardet and read the target encoding from its <meta> tag.
# Indentation and ASCII quotes are reconstructed from a whitespace-mangled
# paste; the script targets Python 2, like the original.
dir = './'
detector = UniversalDetector()
htmlfindfile = re.compile(r"^.*(\.html$|\.htm$)")
findmeta = re.compile("^<meta.* charset *= *")
for root, dirs, files in os.walk(dir):
    for name in files:
        fullname = os.path.join(root, name)
        oldcharset = ""
        newcharset = ""
        detected = 0
        if htmlfindfile.search(fullname):
            # chardet's documented usage is reset() before each new file;
            # the original called close() here, which does not clear the
            # state left over from the previous file.
            detector.reset()
            mypage = open(fullname, 'rb')
            try:
                for line in mypage:
                    # Stop feeding once the detector is confident.
                    if detected == 0:
                        detector.feed(line)
                        if detector.done:
                            detected = 1
                    # Target charset: the text after "charset=" up to the
                    # closing double quote of the <meta> tag.
                    if findmeta.search(line):
                        newcharset = re.sub("^<meta.* charset *= *", "", line)
                        newcharset = re.sub('" *.*$\n*', '', newcharset)
            finally:
                # The original wrote `mypage.close` without parentheses,
                # which never closed the file.
                mypage.close()
            detector.close()
            # detector.result is a dict with 'encoding' and 'confidence';
            # 'encoding' is None when detection failed.
            oldcharset = detector.result['encoding']
            if isinstance(oldcharset, str) and isinstance(newcharset, str):
                oldcharset = oldcharset.strip()
                newcharset = newcharset.strip()
                if oldcharset and newcharset:
                    newpage = open(fullname, 'rb')
                    try:
                        # NOTE(review): the signature is
                        # EncodedFile(file, data_encoding, file_encoding) and
                        # bytes read from the file are decoded with
                        # file_encoding. As posted, the file is decoded with
                        # the <meta> charset instead of the detected one,
                        # which is the likely cause of the UnicodeDecodeError
                        # quoted after this script. The argument order is
                        # left as posted because the text that follows
                        # discusses this failure.
                        allnewpage = codecs.EncodedFile(newpage, oldcharset, newcharset)
                        c = allnewpage.readlines()
                    finally:
                        newpage.close()
            print(fullname)
            print(oldcharset)
            print(newcharset)

Сейчас не могу найти, как записать данные из объекта allnewpage обратно в файл.
Кроме этого, когда я делаю allnewpage.readlines(), то получаю ошибку

Traceback (most recent call last):
  File "I:\portofolio\decode.py", line 40, in __main__
    c  = allnewpage.readlines()
  File "D:\Develop\Python24\Lib\codecs.py", line 592, in readlines
    data = self.reader.read()
  File "D:\Develop\Python24\Lib\codecs.py", line 293, in read
    newchars, decodedbytes = self.decode(data, self.errors)
UnicodeDecodeError: 'utf8' codec can't decode bytes in position 3714-3715: invalid data

Все странички написаны в кодировке cp1251

Casufi · Окт. 9, 2007 12:46:57

Вот рабочий результат

#!/usr/local/bin/python

import os
import re
import chardet
import codecs
import types
from chardet.universaldetector import UniversalDetector

# Walk a directory tree and re-encode every .html/.htm file in place, from
# the encoding detected by chardet to the one declared in its <meta> tag.
# Indentation and ASCII quotes are reconstructed from a whitespace-mangled
# paste; the script targets Python 2, like the original.
dir = './'
detector = UniversalDetector()
htmlfindfile = re.compile(r"^.*(\.html$|\.htm$)")
findmeta = re.compile("^<meta.* charset *= *")
for root, dirs, files in os.walk(dir):
    for name in files:
        fullname = os.path.join(root, name)
        oldcharset = ""
        newcharset = ""
        detected = 0
        if htmlfindfile.search(fullname):
            # chardet's documented usage is reset() before each new file;
            # the original called close() here, which does not clear the
            # state left over from the previous file.
            detector.reset()
            mypage = open(fullname, 'rb')
            try:
                for line in mypage:
                    # Stop feeding once the detector is confident.
                    if detected == 0:
                        detector.feed(line)
                        if detector.done:
                            detected = 1
                    # Target charset: the text after "charset=" up to the
                    # closing double quote of the <meta> tag.
                    if findmeta.search(line):
                        newcharset = re.sub("^<meta.* charset *= *", "", line)
                        newcharset = re.sub('" *.*$\n*', '', newcharset)
            finally:
                # The original wrote `mypage.close` without parentheses,
                # which never closed the file.
                mypage.close()
            detector.close()
            # detector.result is a dict with 'encoding' and 'confidence';
            # 'encoding' is None when detection failed.
            oldcharset = detector.result['encoding']
            if isinstance(oldcharset, str) and isinstance(newcharset, str):
                oldcharset = oldcharset.strip()
                newcharset = newcharset.strip()
                if oldcharset and newcharset:
                    newpage = open(fullname, 'rb')
                    try:
                        rawpage = newpage.read()
                    finally:
                        newpage.close()
                    # Convert the whole page in memory BEFORE reopening the
                    # file for writing: a decode/encode error must not leave
                    # a truncated file behind. Decoding the page in one piece
                    # also avoids splitting multi-byte sequences at line
                    # boundaries.
                    decodedpage = codecs.encode(
                        codecs.decode(rawpage, oldcharset), newcharset)
                    decodedfile = open(fullname, 'wb')
                    try:
                        decodedfile.write(decodedpage)
                    finally:
                        decodedfile.close()

            print(fullname)
            print(oldcharset)
            print(newcharset)

Lolka · Окт. 10, 2007 22:52:52

Думаю, теперь Вы влюблены в Питон навсегда =)

Casufi · Окт. 11, 2007 20:59:26

Lolka
Думаю, теперь Вы влюблены в Питон навсегда =)

Питон — это ХОРОШИЙ инструмент, очень хороший.

Python-сообщество

Уведомления

#1 Окт. 8, 2007 17:10:21

Как записать объект codecs.EncodedFile

#2 Окт. 9, 2007 12:46:57

Как записать объект codecs.EncodedFile

#3 Окт. 10, 2007 22:52:52

Как записать объект codecs.EncodedFile

#4 Окт. 11, 2007 20:59:26

Как записать объект codecs.EncodedFile

Board footer