Прежние замечания пока не исправлены, но сейчас этот скрипт уже можно использовать для поставленной задачи.
Есть куча fb2-файлов. Добавлять всё это в calibre или fbreader и искать через них не хочу. Скрипт ищет ключевое слово в метаданных всех fb2-файлов и либо выводит список найденных файлов в stdout, либо копирует их в специально созданную для этого папку.
Может, не окончательное, но решение.
import xml.etree.ElementTree as et
import xml # to handle xml exceptions
import argparse
import os
import shutil
import sys
import argparse
def check_elem(elem, target, field=None):
    """Return True if ``target`` occurs in the text of ``elem`` or a descendant.

    Used by ``check_descr``. Only the element text and tag are lower-cased
    here, so callers are expected to pass ``target`` and ``field`` already
    lower-cased.

    elem   - xml.etree.ElementTree element to inspect
    target - substring to look for in the element text
    field  - optional substring the tag of the matching element must contain;
             None (default) accepts any tag

    Returns True on the first match, otherwise False.
    """
    # elem.text is None for elements without text; str(None) would give the
    # literal "None" and produce false matches for targets such as "none".
    text = (elem.text or "").lower()
    if target in text and (not field or field in elem.tag.lower()):
        return True
    # Keep descending even if this element's text matched under the wrong
    # tag: a descendant may still satisfy both conditions.
    return any(check_elem(child, target, field=field) for child in elem)
def check_descr(fname, target, field=None):
    """Return True if ``target`` is found in the description of an fb2 file.

    The first child of the XML root is treated as the description and the
    second as the body. When the description has child elements only they
    are searched; when it is empty, the children of ``annotation``/``title``
    elements directly under the body are searched instead.

    fname  - file name (anything ``xml.etree.ElementTree.parse`` accepts)
    target - string to find (compared case-insensitively)
    field  - field name (substring of element.tag), default None (all tags)

    Returns True on a match, otherwise False. XML parse errors raised by
    ``et.parse`` are propagated to the caller.
    """
    root = et.parse(fname).getroot()
    target = target.lower()
    if field:
        field = field.lower()

    # Materialise the children once so a root with fewer than two children
    # yields False instead of raising IndexError.
    sections = list(root)

    if sections and len(sections[0]) > 0:
        return any(
            check_elem(item, target=target, field=field)
            for item in sections[0]
        )

    if len(sections) > 1:
        for part in sections[1]:
            if "annotation" in part.tag or "title" in part.tag:
                if any(
                    check_elem(elem=sub, target=target, field=field)
                    for sub in part
                ):
                    return True
    return False
def unroll(elem):
    """Debug helper: print tag and text of ``elem`` and of each direct child."""
    # The element itself first, then its immediate children (no recursion).
    for node in (elem, *elem):
        print(node.tag, node.text)
def check_fb2_files(path, keyword):
    """Search the fb2 files in ``path`` for ``keyword``.

    Changes the current working directory to ``path``; the returned file
    names are relative to it.

    path    - folder to scan (not recursive)
    keyword - string passed to ``check_descr`` for every ``*.fb2`` file

    Returns a tuple ``(found, bad_files)`` of two lists of file names:
    files whose metadata contains the keyword, and files that could not be
    parsed as XML. Both lists are empty when the folder has no fb2 files.
    """
    os.chdir(path)
    # Match the real ".fb2" extension (any letter case) and skip directories,
    # which cannot be parsed as files. Sorted for a stable output order.
    fb2_list = sorted(
        name for name in os.listdir()
        if name.lower().endswith(".fb2") and os.path.isfile(name)
    )

    found = []
    bad_files = []
    for name in fb2_list:
        try:
            matched = check_descr(name, keyword)
        except xml.etree.ElementTree.ParseError:
            # Not well-formed XML: report it instead of aborting the scan.
            bad_files.append(name)
        else:
            if matched:
                found.append(name)
    return found, bad_files
def copy_found(files, folder):
    """Copy every file in ``files`` into ``folder``.

    files  - iterable of file paths to copy
    folder - destination folder, relative or absolute; created if missing

    Each file keeps its base name inside ``folder``.
    """
    os.makedirs(folder, exist_ok=True)
    for item in files:
        # os.path.join handles absolute folders; basename keeps items that
        # carry a directory part from escaping the destination folder.
        destination = os.path.join(folder, os.path.basename(item))
        shutil.copyfile(item, destination)
def print_found(message, files):
    """Print ``message`` after two blank lines, then one entry of ``files`` per line."""
    report = ["\n\n" + message]
    report.extend(str(entry) for entry in files)
    print("\n".join(report))
def delete_found(files):
    """Remove each path listed in ``files`` from the file system."""
    for victim in files:
        # os.unlink is the same operation as os.remove under another name.
        os.unlink(victim)
def main():
    """Command-line entry point: search fb2 files for a keyword.

    Options:
      -p/--path   folder with fb2 files (default ".")
      -k/--key    keyword to search for (required)
      -c/--copy   copy the found files into a "result_<keyword>" folder
      --clean     delete files that could not be parsed

    Prints the found and the badly formatted files to stdout.
    Returns 0.
    """
    parser = argparse.ArgumentParser(
        description="Search fb2 files and copy found to folder")
    parser.add_argument("-p", "--path", action="store",
                        help="path to folder with fb2 files",
                        default=".", dest="path")
    parser.add_argument("-k", "--key", action="store",
                        help="keyword to search for",
                        dest="keyword", required=True)
    # The help text names the folder that is actually created below.
    parser.add_argument("-c", "--copy", action="store_true",
                        help="copy found files into \"result_<keyword>\" folder",
                        dest="copy_found", default=False)
    parser.add_argument("--clean", action="store_true", required=False,
                        help="delete bad files, if such files found",
                        dest="delete_bad_files")
    args = parser.parse_args()
    path = args.path
    keyword = args.keyword

    print("Searching for keyword '{}' in folder '{}'".format(keyword, path))
    # check_fb2_files changes the working directory to `path`, so the
    # relative names used below resolve inside the searched folder.
    found, bad = check_fb2_files(path, keyword)

    if found:
        print_found(
            "Found {0} files, keyword '{1}'".format(len(found), keyword),
            found)
        if args.copy_found:
            folder = "result_{}".format(keyword)
            # exist_ok: repeating a search with the same keyword must not
            # crash on the folder left by the previous run.
            os.makedirs(folder, exist_ok=True)
            copy_found(found, folder)
    else:
        print("No files with '{}' keyword found".format(keyword))

    if bad:
        print_found(
            "Found {0} bad formatted fb2 files".format(len(bad)), bad)
        if args.delete_bad_files:
            delete_found(bad)
    return 0
if __name__ == '__main__':
    # Run only when executed as a script; hand main()'s return value to the
    # shell as the process exit status instead of discarding it.
    sys.exit(main())