class WikipediaTitleHandler(xml.sax.ContentHandler):
    """SAX content handler that writes the text of every ``title`` element.

    Each title is written to standard output as one UTF-8 encoded line.
    Titles that begin with one of ``SKIPPED_PREFIXES`` are not written.

    NOTE(review): this class was rebuilt from an excerpt that omitted the
    body of ``startElement`` and the bodies of the prefix checks. They are
    assumed to be ``self.tag = name`` / ``self.chars = []`` and ``return``
    respectively -- confirm against the full file.
    """

    # Title prefixes that are filtered out of the output.
    SKIPPED_PREFIXES = (
        'Talk:',
        'User talk:',
        'Wikipedia:',
        'Wikipedia talk:',
        'User:',
    )

    def startElement(self, name, attrs):
        """Remember the element being entered and start a fresh text buffer."""
        self.tag = name
        self.chars = []

    def characters(self, content):
        """Buffer character data while the current element is ``title``.

        The parser may deliver one element's text in several chunks, so the
        pieces are collected and joined in ``endElement``.
        """
        if self.tag == 'title':
            self.chars.append(content)

    def endElement(self, name):
        """Write the buffered title unless it starts with a skipped prefix."""
        if self.tag == 'title':
            title = ''.join(self.chars)
            # str.startswith accepts a tuple: one call covers every prefix.
            if not title.startswith(self.SKIPPED_PREFIXES):
                self._write_line(title)
        # Forget the element that just closed; otherwise the closing tag of
        # a parent element would see a stale 'title' and emit it again.
        self.tag = None

    def _write_line(self, title):
        """Write *title* plus a newline to stdout as UTF-8 bytes."""
        import sys

        # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
        # sys.stdout itself accepts byte strings.
        stream = getattr(sys.stdout, 'buffer', sys.stdout)
        stream.write(title.encode('utf8') + b'\n')
def process_xml(input):
    """Run a SAX parser over *input* with a ``WikipediaTitleHandler``.

    *input* is handed to the parser unchanged.
    """
    title_handler = WikipediaTitleHandler()
    reader = xml.sax.make_parser()
    reader.setContentHandler(title_handler)
    # NOTE(review): the statement that starts parsing was not visible in the
    # excerpt this was rebuilt from; ``parse(input)`` is assumed -- confirm.
    reader.parse(input)
if __name__ == '__main__':
    # Read the bzip2-compressed dump from file descriptor 0.
    dump = bz2.BZ2File('/dev/fd/0')
    try:
        # NOTE(review): the call below was not visible in the excerpt this
        # was rebuilt from; it is assumed from process_xml's signature.
        process_xml(dump)
    finally:
        # Release the decompressor even if parsing raises.
        dump.close()
51 # dirname = '/srv/slapgrid/slappart20/srv/runner/instance/slappart0/software_release/raw-data/'
52 # filenames = os.listdir(dirname)
53 # # ['enwiki-20140203-pages-meta-current1.xml-p000000010p000010000.bz2']
54 # for fname in filenames:
55 # process_xml(os.path.join(dirname, fname))
56 # input = bz2.BZ2File(process_xml(os.path.join(dirname, fname)))