Writing the classic wordcount map/reduce in Python:
[hadoop@SN2008-04-003 temp]$ cat mapper.py
#!/bin/env python
import sys
for line in sys.stdin:
    line = line.strip()
    words = line.split()
    for word in words:
        print '%s\t%s' % (word, 1)
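The mapper is nothing more than a stdin-to-stdout filter, so it can be sanity-checked locally before Hadoop is involved at all. A minimal sketch, assuming mapper.py sits in the current directory and "python" is a Python 2 interpreter (the sample lines are made up):

import subprocess
# Pipe two sample lines through mapper.py and print the word<TAB>1 pairs it emits.
p = subprocess.Popen(['python', 'mapper.py'],
                     stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, _ = p.communicate('hello hadoop\nhello streaming\n')
print out
# Expected output:
# hello	1
# hadoop	1
# hello	1
# streaming	1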
[hadoop@SN2008-04-003 temp]$ cat reducer.py
#!/bin/env python
import sys
mydict = dict()
for line in sys.stdin:
    (word, cnt) = line.strip().split('\t')
    # Each line from the mapper represents one occurrence of the word.
    mydict[word] = mydict.get(word, 0) + 1
for word, cnt in mydict.items():
    print word, cnt
[hadoop@SN2008-04-003 temp]$
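For a file the size of /etc/passwd the in-memory dict above is perfectly adequate, but note that Hadoop Streaming sorts the mapper output by key before it reaches the reducer, so identical words arrive on consecutive lines. A reducer can therefore aggregate one key at a time in constant memory; a minimal alternative sketch (not the version actually run below):

#!/bin/env python
# Alternative reducer sketch: relies on the shuffle/sort phase delivering
# identical keys on consecutive lines, so only one running total is kept.
import sys
from itertools import groupby

def kv_pairs(stream):
    for line in stream:
        word, cnt = line.strip().split('\t')
        yield word, int(cnt)

for word, group in groupby(kv_pairs(sys.stdin), key=lambda kv: kv[0]):
    print '%s\t%d' % (word, sum(cnt for _, cnt in group))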
[hadoop@SN2008-04-003 hadoop]$ hadoop fs -mkdir tst
[hadoop@SN2008-04-003 hadoop]$ hadoop fs -put /etc/passwd tst
[hadoop@SN2008-04-003 hadoop]$ hadoop fs -ls tst/passwd
Found 1 items
-rw-r--r-- 1 hadoop supergroup 1689 2012-10-12 10:28 /user/hadoop/tst/passwd
[hadoop@SN2008-04-003 hadoop]$ hadoop jar contrib/streaming/hadoop-0.20.2-streaming.jar -file ../temp/mapper.py -mapper ../temp/mapper.py -file ../temp/reducer.py -reducer ../temp/reducer.py -input ./tst/ -output ./output/
packageJobJar: [../temp/mapper.py, ../temp/reducer.py, /tmp/hadoop-hadoop/hadoop-unjar115546496688852664/] [] /tmp/streamjob3277639832077031630.jar tmpDir=null
12/10/12 10:29:09 INFO mapred.FileInputFormat: Total input paths to process : 1
12/10/12 10:29:09 INFO streaming.StreamJob: getLocalDirs(): [/tmp/hadoop-hadoop/mapred/local]
12/10/12 10:29:09 INFO streaming.StreamJob: Running job: job_201209181743_0002
12/10/12 10:29:09 INFO streaming.StreamJob: To kill this job, run:
12/10/12 10:29:09 INFO streaming.StreamJob: /home/hadoop/hadoop/bin/../bin/hadoop job -Dmapred.job.tracker=19.2.174.7:9011 -kill job_201209181743_0002
12/10/12 10:29:09 INFO streaming.StreamJob: Tracking URL:
12/10/12 10:29:10 INFO streaming.StreamJob: map 0% reduce 0%
12/10/12 10:29:16 INFO streaming.StreamJob: map 100% reduce 0%
12/10/12 10:29:28 INFO streaming.StreamJob: map 100% reduce 100%
12/10/12 10:29:31 INFO streaming.StreamJob: Job complete: job_201209181743_0002
12/10/12 10:29:31 INFO streaming.StreamJob: Output: ./output/
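A couple of details about the streaming invocation: each -file option ships the named script to the task nodes along with the job, and -mapper/-reducer give the command that every map/reduce task runs over its share of the input, so the scripts need the executable bit and a valid shebang (alternatively they can be invoked as, e.g., -mapper "python mapper.py"). Also note that the -output directory must not exist beforehand; Hadoop refuses to overwrite it.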
[hadoop@SN2008-04-003 hadoop]$ hadoop fs -lsr output
drwxr-xr-x - hadoop supergroup 0 2012-10-12 10:29 /user/hadoop/output/_logs
drwxr-xr-x - hadoop supergroup 0 2012-10-12 10:29 /user/hadoop/output/_logs/history
-rw-r--r-- 1 hadoop supergroup 17188 2012-10-12 10:29 /user/hadoop/output/_logs/history/SN2008-04-003_1347961409932_job_201209181743_0002_conf.xml
-rw-r--r-- 1 hadoop supergroup 9091 2012-10-12 10:29 /user/hadoop/output/_logs/history/SN2008-04-003_1347961409932_job_201209181743_0002_hadoop_streamjob3277639832077031630.jar
-rw-r--r-- 1 hadoop supergroup 1828 2012-10-12 10:29 /user/hadoop/output/part-00000
[hadoop@SN2008-04-003 hadoop]$ hadoop fs -cat output/part-00000
dbus:x:81:81:System 1
haldaemon:x:68:68:HAL 1
#lp:x:4:7:lp:/var/spool/lpd:/sbin/nologin 1
avahi:x:70:70:Avahi 1
vcsa:x:69:69:virtual 1
#uucp:x:10:14:uucp:/var/spool/uucp:/sbin/nologin 1
postfix:x:89:89::/var/spool/postfix:/sbin/nologin 1
User:/var/lib/nfs:/sbin/nologin 2
...
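The keys look odd because /etc/passwd separates its fields with colons rather than spaces, so most lines survive the whitespace split as one long "word"; the 2 next to User:/var/lib/nfs:/sbin/nologin simply means two passwd lines produced that exact token. If a frequency-sorted listing is wanted, the part file can be copied out of HDFS (hadoop fs -get output/part-00000 wordcount.txt) and post-processed locally; a small sketch, the local filename being an arbitrary choice:

# Sort the wordcount result by descending count; the reducer printed "word count".
counts = []
for line in open('wordcount.txt'):
    word, cnt = line.rsplit(None, 1)
    counts.append((int(cnt), word))
for cnt, word in sorted(counts, reverse=True):
    print '%d\t%s' % (cnt, word)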
==over==