[slapos.git] / software / hadoop-demo / wikipedia / reducer.py
#!/usr/bin/env python

import bz2
import os
import sys
import xml.sax

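# Extract page titles from a bz2-compressed Wikipedia XML dump read on
# stdin, skipping titles in the Talk, User talk, Wikipedia, Wikipedia talk
# and User namespaces.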
class WikipediaTitleHandler(xml.sax.ContentHandler):
    """SAX handler that prints page titles, skipping talk/user/project pages."""

    # Pages in these namespaces are discussion, user or project pages.
    SKIPPED_PREFIXES = ('Talk:', 'User talk:', 'Wikipedia:',
                        'Wikipedia talk:', 'User:')

    def startElement(self, name, attrs):
        # Reset the text buffer for every element that opens.
        self.chars = []
        self.tag = name

    def characters(self, content):
        # SAX may deliver the text of a single element in several chunks.
        if self.tag == 'title':
            self.chars.append(content)

    def endElement(self, name):
        if name == 'title':
            title = ''.join(self.chars)
            if title.startswith(self.SKIPPED_PREFIXES):
                return
            print title.encode('utf8')
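
# Design note: xml.sax drives the handler with startElement / characters /
# endElement callbacks while streaming the document, so even a multi-gigabyte
# dump is processed without building the whole XML tree in memory.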


def process_xml(input):
    # parse() accepts any file-like object, so the (decompressing) BZ2File
    # can be handed to the parser directly.
    parser = xml.sax.make_parser()
    parser.setContentHandler(WikipediaTitleHandler())
    parser.parse(input)

if __name__ == '__main__':
    # /dev/fd/0 is stdin: the compressed .bz2 dump is piped in and
    # decompressed on the fly.
    input = bz2.BZ2File('/dev/fd/0')
    process_xml(input)

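
# Usage sketch (illustrative, not taken from this repository): feed the
# still-compressed dump on stdin, e.g.
#
#   ./reducer.py < enwiki-20140203-pages-meta-current1.xml-p000000010p000010000.bz2
#
# The file name suggests the script is wired into the hadoop-demo job as a
# Hadoop Streaming reducer; that wiring is not shown in this file.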

# Commented-out local-filesystem variant (reads the dumps from a directory
# instead of stdin):
# dirname = '/srv/slapgrid/slappart20/srv/runner/instance/slappart0/software_release/raw-data/'
# filenames = os.listdir(dirname)
# # ['enwiki-20140203-pages-meta-current1.xml-p000000010p000010000.bz2']
# for fname in filenames:
#     process_xml(os.path.join(dirname, fname))
# input = bz2.BZ2File(process_xml(os.path.join(dirname, fname)))