Найти - Пользователи
Полная версия: indexing in python
Начало » Python для новичков » indexing in python
1
aseka88
zdrastvuite,opyat!
i hotela izvinitsya za predydushyi vopros….
hotela prosto uznat,kak mozhno sdelat “indexing” v spiske?(ne znau,kak na russkom budet)a imenno:
mne dan spisok filmov v xml=> pri etom ya budu dolzhna podelit etu informaciu tak,chtoby informaciya o kazhdom filme byla v otdelnom file.
pozhaluista podskazhite kak byt dalshe……
vot,chto ya sdelala:
from lxml import etree
from collections import deque

import cPickle as pickle

# Counter used to number the per-movie output files; starts at zero.
movie_number = 0

# Parse the movie list once and keep both the document tree and its root.
tree = etree.parse("movies_small.xml")
root = tree.getroot()

def normalise(str):
    """Return *str* with surrounding whitespace removed and in lower case.

    The parameter name shadows the builtin ``str``; it is kept unchanged so
    that existing keyword callers keep working.
    """
    return str.strip().lower()

def create_movie_dict(index, a_movie, movie_number):
    """Pickle one movie's data to its own file and index its title words.

    Parameters:
        index: dict mapping a normalised title keyword to the list of
            data-file names whose title contains that keyword. It is
            updated in place.
        a_movie: iterable of the child elements of one movie node. Each
            child is read through ``.tag`` and ``.text``; review children
            are also read through ``.get("source")``.
        movie_number: number used to build the output file name
            ``"m" + str(movie_number)``.

    Side effects:
        Writes the movie dictionary with pickle to ``data/<file name>``.
    """
    mov_dict = {}
    filename = "m" + str(movie_number)
    for an_element in a_movie:
        tag = an_element.tag
        if tag == "cast":
            mov_dict["cast"] = [an_actor.text for an_actor in an_element]
        elif tag == "reviews":
            # Each review becomes a (source, score) pair: the source comes
            # from the attribute, the score from the element text.
            mov_dict["reviews"] = [
                (a_review.get("source"), a_review.text)
                for a_review in an_element
            ]
        else:
            mov_dict[tag] = an_element.text
            if tag == "title":
                # Build the index for the title: every word of the title
                # points back to this movie's file. An empty title has
                # text None, hence the fallback to "".
                for keyword in (an_element.text or "").split():
                    entry = index.setdefault(normalise(keyword), [])
                    if filename not in entry:
                        entry.append(filename)
    # Binary mode is what pickle expects; ``with`` closes the file even if
    # dumping fails.
    with open('data/' + filename, 'wb') as movie_file:
        pickle.dump(mov_dict, movie_file)

# Keyword index shared by all movies; it is handed to create_movie_dict for
# every movie so that entries can be recorded there.
index = {}

for a_movie in root:
    create_movie_dict(index, a_movie, movie_number)
    movie_number += 1

# Pickle the index itself. The original code referred to `filename` and
# `mov_dict`, which only exist inside create_movie_dict, so it failed with a
# NameError. The movie files are named "m<number>", so "index" cannot clash
# with any of them.
with open('data/index', 'wb') as index_file:
    pickle.dump(index, index_file)
aseka88
p.s proshu prosheniya chto ne mogu pisat na russkom- net klaviatury (t.e klaviatura na angliiskom=> i na russkom ochennnn dolgo pisat)
PooH
http://www.translit.ru/
aseka88
ludi,sorry konechno,no neuzheli tak slozhno ponyat chto ya napisala?prosto k vam normalno obrashaeshsya=> a vy link skidyvaete vmesto togo,chtoby pomoch'….
PooH
уломали :) дайте пример xml файла
aseka88
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE movielist [
<!ELEMENT movielist (movie*)>
<!ELEMENT movie (title, rating, studio, genre, cast, writer, director, date, origin, reviews, earnings)>
<!ELEMENT title (#PCDATA)>
<!ELEMENT rating (#PCDATA)>
<!ELEMENT studio (#PCDATA)>
<!ELEMENT genre (#PCDATA)>
<!ELEMENT cast (actor*)>
<!ELEMENT writer (#PCDATA)>
<!ELEMENT director (#PCDATA)>
<!ELEMENT date (#PCDATA)>
<!ELEMENT origin (#PCDATA)>
<!ELEMENT reviews (review+)>
<!ELEMENT earnings (#PCDATA)>
<!ELEMENT actor (#PCDATA)>
<!ELEMENT review (#PCDATA)>
<!ATTLIST review source CDATA #REQUIRED>
]>
<movielist>
<movie>
<title>Harry Potter and the Goblet of Fire</title>
<rating>PG</rating>
<studio>Warner Bros.</studio>
<genre>Adventure</genre>
<cast>
<actor>Daniel Radcliffe</actor>
<actor>Emma Watson</actor>
<actor>Rupert Grint</actor>
</cast>
<writer>Steven Kloves</writer>
<director>Mike Newell</director>
<date>2006</date>
<origin>UK / USA</origin>
<reviews>
<review source="Washington Post">80</review>
<review source="Chicago Sun-Times">88</review>
<review source="The New York Times">80</review>
<review source="LA Weekly">60</review>
<review source="Los Angeles Times">90</review>
<review source="Rolling Stone">75</review>
<review source="Wall Street Journal">90</review>
<review source="Entertainment Weekly">67</review>
<review source="Empire">60</review>
<review source="Variety">90</review>
<review source="Salon.com">90</review>
<review source="The Onion (A.V. Club)">80</review>
<review source="TV Guide">75</review>
<review source="Slate">100</review>
<review source="metascore">81</review>
</reviews>
<earnings>892194397</earnings>
</movie>
<movie>
<title>Shrek the Third</title>
<rating>PG</rating>
<studio>DreamWorks Distribution</studio>
<genre>Animation</genre>
<cast>
<actor>Mike Myers</actor>
<actor>Eddie Murphy</actor>
<actor>Cameron Diaz</actor>
</cast>
<writer>Jeffrey Price</writer>
<director>Chris Miller</director>
<date>2007</date>
<origin>USA</origin>
<reviews>
<review source="Washington Post">50</review>
<review source="Chicago Sun-Times">63</review>
<review source="The New York Times">80</review>
<review source="Los Angeles Times">50</review>
<review source="Rolling Stone">50</review>
<review source="Wall Street Journal">50</review>
<review source="Entertainment Weekly">67</review>
<review source="Empire">40</review>
<review source="Variety">80</review>
<review source="Salon.com">70</review>
<review source="The Onion (A.V. Club)">25</review>
<review source="TV Guide">50</review>
<review source="metascore">58</review>
</reviews>
<earnings>791106665</earnings>
</movie>
</movielist>



i SPASIBO OGROMNOE!!!!!!!!!!!!!=)
PooH
Ну если я правильно понял что вам нужно:
from lxml import etree
import cPickle as pickle

def normalize(str):
    """Return *str* stripped of surrounding whitespace and lower-cased.

    The parameter name shadows the builtin ``str``; it is kept unchanged so
    that existing keyword callers keep working.
    """
    return str.strip().lower()

def parse_movie(movie):
    """Extract the title keywords and the data dictionary of one movie.

    Parameters:
        movie: iterable of the child elements of one movie node. Each child
            is read through ``.tag`` and ``.text``; review children are also
            read through ``.get("source")``.

    Returns:
        A ``(keywords, data)`` tuple. ``keywords`` is the list of lower-cased
        words of the title (empty when there is no title or it has no text).
        ``data`` maps each child tag to its text, except ``"cast"`` (list of
        the children's texts) and ``"reviews"`` (list of ``(source, score)``
        tuples).
    """
    keywords, data = [], {}
    for element in movie:
        if element.tag == "title":
            title = element.text
            # split() already discards surrounding whitespace, so lower-casing
            # each word yields the same keywords as normalize() would. An
            # empty title element has text None, hence the fallback to "".
            keywords = [word.lower() for word in (title or "").split()]
            data["title"] = title
        elif element.tag == "cast":
            data["cast"] = [actor.text for actor in element]
        elif element.tag == "reviews":
            # The source is an attribute and the score is the element text.
            # The previous expression passed the score lookup as the default
            # of get("source") and so stored only the source string.
            data["reviews"] = [
                (review.get("source"), review.text) for review in element
            ]
        else:
            data[element.tag] = element.text
    return keywords, data

# Parse the movie list, write one pickle file per movie and build a keyword
# index mapping each title keyword to the names of the files containing it.
tree = etree.parse("test.xml")
root = tree.getroot()
index = {}
for n, movie in enumerate(root):
    filename = "m%d.dat" % n
    keywords, data = parse_movie(movie)
    # Binary mode is what pickle expects; ``with`` closes the file promptly
    # instead of leaving the handle open.
    with open("data/%s" % filename, 'wb') as movie_file:
        pickle.dump(data, movie_file)
    for key in keywords:
        entry = index.setdefault(key, [])
        # A title may repeat a word; list each file only once per keyword.
        if filename not in entry:
            entry.append(filename)
with open('data/index.dat', 'wb') as index_file:
    pickle.dump(index, index_file)
В index.dat индекс файлов в виде словаря ‘ключевое слово’ - ‘список имен файлов’
pyuser
Почему-то мне думается, что строка:
            data["reviews"] = [(x.get("source", x.get("score"))) for x in element]
должна выглядеть так:
            data["reviews"] = [(x.get("source"), x.text) for x in element]
PooH
pyuser
Почему-то мне думается, что строка:
Правильно думается :) не обратил внимания :(
aseka88
RAHMET OGROMNYI!!!!!!!!!!!!!!!!!!!!!!
This is a "lo-fi" version of our main content. To view the full version with more information, formatting and images, please click here.
Powered by DjangoBB