Как удалить дубликаты с помощью Mapreduce?
Итак, у меня есть этот код, который мне нужно изменить, чтобы удалить дубликаты в моём наборе данных, но я не понимаю, как именно его изменить. Если я правильно понимаю, мне не нужны правила разбиения — я могу просто связать значение со строкой, верно? Записывать суммарные значения в вывод мне тоже не нужно. Буду благодарен за помощь!
(Это reducer.py)
#!/usr/bin/env python
# ---------------------------------------------------------------
# Reducer: reads "<key>\t<value>" lines from standard input
# (Hadoop streaming's sort phase guarantees the lines arrive
# grouped by key) and emits "<key>\t<total>" for each key.
# ---------------------------------------------------------------
import sys


def reduce_stream(lines):
    """Yield (key, total) pairs from an iterable of tab-separated lines.

    Input lines must be sorted/grouped by key (the Hadoop streaming
    contract); each yielded total is the sum of all integer values
    seen for that key.  Blank lines are skipped.  This function does
    not validate that values are integers beyond int() raising.
    """
    last_key = None
    running_total = 0
    for input_line in lines:
        input_line = input_line.strip()
        if not input_line:
            continue  # ignore blank lines rather than crashing on split
        # Hadoop streaming's default key/value separator is a tab.
        this_key, value = input_line.split("\t", 1)
        value = int(value)
        if this_key == last_key:
            # Same key as the previous line: consolidate.
            running_total += value
        else:
            # Key changed: emit the finished previous key (if any),
            # then start a new running total.
            # 'is not None' (not truthiness) so an empty-string key
            # is still emitted correctly.
            if last_key is not None:
                yield last_key, running_total
            last_key = this_key
            running_total = value
    # Flush the final key.  Guarded so empty input yields nothing
    # instead of raising (the original referenced an unbound name here).
    if last_key is not None:
        yield last_key, running_total


def main():
    """Run the reducer over stdin, printing tab-separated results."""
    for key, total in reduce_stream(sys.stdin):
        print("{0}\t{1}".format(key, total))


if __name__ == "__main__":
    main()
(Это mapper.py)
#!/usr/bin/env python
# ---------------------------------------------------------------
# Mapper: reads lines of text from standard input and emits one
# "<word>\t1" pair per whitespace-separated word, the standard
# word-count map step for Hadoop streaming.
# ---------------------------------------------------------------
import sys


def map_stream(lines):
    """Yield (word, 1) pairs for every word in an iterable of lines.

    Each line is stripped of surrounding whitespace (including the
    trailing newline) and split on runs of whitespace; every resulting
    token is emitted with a count of 1.
    """
    for line in lines:
        # strip() removes the trailing newline; split() with no
        # argument splits on any whitespace and drops empty tokens.
        for key in line.strip().split():
            yield key, 1


def main():
    """Run the mapper over stdin, printing tab-separated pairs.

    Hadoop streaming's default convention is that a tab separates
    the key from the value.  (The original paste had the
    'for line in sys.stdin:' header fused into a comment line,
    leaving the body referencing an unbound 'line'; this restores
    the loop.)
    """
    for key, value in map_stream(sys.stdin):
        print("{0}\t{1}".format(key, value))


if __name__ == "__main__":
    main()