mapper.py
#!/usr/bin/env python
"""A more advanced Mapper, using Python iterators and generators.

Reads text lines from STDIN and writes one "word + separator + 1" record per
word to STDOUT.  The output is meant to be fed to the matching reducer script.
Runs unchanged on Python 2 and Python 3.
"""

import sys


def read_input(file):
    """Yield the list of whitespace-separated words of each line in *file*.

    *file* is any iterable of text lines (for example ``sys.stdin``).  A blank
    line yields an empty list, so it contributes no words downstream.
    """
    for line in file:
        # split the line into words (any run of whitespace is a delimiter)
        yield line.split()


def main(separator=" "):
    """Write "word + separator + 1" to STDOUT for every word read from STDIN.

    *separator* is placed between the word and its count.

    NOTE(review): the original comment called the output "tab-delimited" while
    the default separator here is a single space; confirm whether the default
    was meant to be a tab before relying on it with other tools.
    """
    # input comes from STDIN (standard input)
    data = read_input(sys.stdin)
    for words in data:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # separator-delimited; the trivial word count is 1
        for word in words:
            # Single-argument print(...) behaves identically as a Python 2
            # print statement and as a Python 3 function call.
            print("%s%s%d" % (word, separator, 1))


if __name__ == "__main__":
    main()

# --- end of mapper.py; the listing that follows is reducer.py ---
#!/usr/bin/env python
"""A more advanced Reducer, using Python iterators and generators.

Reads "word + separator + count" records from STDIN, sums the counts of each
run of identical consecutive words, and writes "word + separator + total" to
STDOUT.  Runs unchanged on Python 2 and Python 3.
"""

from itertools import groupby
from operator import itemgetter
import sys


def read_mapper_output(file, separator=" "):
    """Yield each line of *file* split once on *separator*.

    Trailing whitespace (including the newline) is stripped first.  A
    well-formed line yields a two-element list ``[word, count]``; a line that
    does not contain *separator* yields a one-element list.
    """
    for line in file:
        yield line.rstrip().split(separator, 1)


def main(separator=" "):
    """Sum the counts per word from STDIN and print the totals to STDOUT.

    *separator* is used both to parse the incoming records and to format the
    outgoing ones.
    """
    # input comes from STDIN (standard input)
    data = read_mapper_output(sys.stdin, separator=separator)
    # groupby groups multiple word-count pairs by word,
    # and creates an iterator that returns consecutive keys and their group:
    #   current_word - string containing a word (the key)
    #   group - iterator yielding all [current_word, count] items
    # groupby only merges *consecutive* equal keys, so the input must already
    # be sorted by word for the totals to be complete.
    for current_word, group in groupby(data, itemgetter(0)):
        try:
            # `_` instead of re-binding current_word: the key from groupby is
            # the value we want to print, not whatever the last item held.
            total_count = sum(int(count) for _, count in group)
        except ValueError:
            # A record in this group had no count field or a count that is
            # not an integer.  The whole group for this word is silently
            # dropped (not just the offending record).
            pass
        else:
            # Single-argument print(...) behaves identically as a Python 2
            # print statement and as a Python 3 function call.
            print("%s%s%d" % (current_word, separator, total_count))


if __name__ == "__main__":
    main()
转自:http://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/
文章版权归作者所有,未经允许请勿转载,若此文章存在违规行为,您可以联系管理员删除。
转载请注明本文地址:https://www.ucloud.cn/yun/45308.html
摘要: Caching Libraries for caching data. Beaker - A library for caching and sessions for use with web applications and stand-alone Python scripts and applications. dogpile.cache - dogpile.cache...
阅读 3047·2021-11-24 10:34
阅读 3303·2021-11-22 13:53
阅读 2598·2021-11-22 12:03
阅读 3584·2021-09-26 09:47
阅读 2987·2021-09-23 11:21
阅读 4702·2021-09-22 15:08
阅读 3267·2021-07-23 10:59
阅读 1236·2019-08-29 18:31