I'm having trouble getting dedupe to run. I am trying to use this library to remove duplicates from a huge set of addresses. Here is my code:
import collections
import csv
import logging
import optparse
import os
import re

from numpy import nan
import dedupe
from unidecode import unidecode
# Command-line interface: each -v raises the log verbosity one level
# (WARNING -> INFO -> DEBUG).
optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()

# action='count' leaves opts.verbose as None when -v is never given, so
# guard before comparing it with integers (None >= 2 is a TypeError on
# Python 3 and only accidentally False on Python 2).
log_level = logging.WARNING
if opts.verbose:
    if opts.verbose == 1:
        log_level = logging.INFO
    elif opts.verbose >= 2:
        log_level = logging.DEBUG
logging.getLogger().setLevel(log_level)

# File locations: the raw input CSV, the deduplicated output, and the
# artifacts dedupe persists between runs (learned settings + labeled pairs).
input_file = 'H:/My Documents/Python Scripts/Dedupe/DupeTester.csv'
output_file = 'csv_example_output.csv'
settings_file = 'csv_example_learned_settings'
training_file = 'csv_example_training.json'
def preProcess(column):
    """Normalise one raw CSV field for comparison by dedupe.

    Transliterates to ASCII, collapses runs of spaces, flattens newlines,
    strips surrounding quotes/whitespace, and lower-cases, so trivially
    different spellings of the same value compare equal.

    :param column: raw field value (str or utf-8 bytes); may be empty.
    :return: cleaned unicode string ('' for empty/missing input).
    """
    if not column:
        # Missing/empty fields would otherwise crash on .decode / unidecode.
        return ''
    # Only byte strings need decoding; Python 3's csv module yields str
    # already, and re-decoding a str raises.
    if isinstance(column, bytes):
        column = column.decode('utf8')
    # Module-level `from unidecode import unidecode` (top of file) is used
    # directly; the old function-local `import unidecode` shadowed it.
    column = unidecode(column)
    column = re.sub(' +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column
def readData(filename):
    """Read the input CSV into the dict-of-records form dedupe expects.

    :param filename: path to a CSV whose first, unnamed column holds a
        unique integer row id (DictReader stores it under the '' key).
    :return: {row_id: {field_name: preprocessed_value}}
    """
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            row_id = int(row[''])
            # Clean every field through preProcess before handing the
            # record to dedupe.
            data_d[row_id] = {k: preProcess(v) for (k, v) in row.items()}
    return data_d
print 'importing data ...'
data_d = readData(input_file)
if os.path.exists(settings_file):
print 'reading from', settings_file
with open(settings_file, 'rb') as f:
deduper = dedupe.StaticDedupe(f)
else:
fields = [
{"field" : "fulladdr", "type" : "Address"},
{"field" : "zip", "type" : "ShortString"},
]
deduper = dedupe.Dedupe(fields)
deduper.sample(data_d, 200)
if os.path.exists(training_file):
print 'reading labeled examples from ', training_file
with open(training_file, 'rb') as f:
deduper.readTraining(f)
print 'starting active labeling...'
dedupe.consoleLabel(deduper)
deduper.train()
with open(training_file, 'w') as tf :
deduper.writeTraining(tf)
with open(settings_file, 'w') as sf :
deduper.writeSettings(sf)
print 'blocking...'
threshold = deduper.threshold(data_d, recall_weight=2)
print 'clustering...'
clustered_dupes = deduper.match(data_d, threshold)
print '# duplicate sets', len(clustered_dupes)
# Build a per-record lookup: each clustered record id maps to its cluster
# id, the cluster's canonical representation, and its confidence score.
cluster_membership = {}
cluster_id = 0
for cluster_id, (id_set, scores) in enumerate(clustered_dupes):
    members = [data_d[member_id] for member_id in id_set]
    canonical_rep = dedupe.canonicalize(members)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score,
        }

# Unclustered records will get fresh ids starting just past the last
# cluster id (cluster_id deliberately leaks out of the loop; it stays 0
# when there are no clusters at all).
singleton_id = cluster_id + 1
# ---- Write the annotated output CSV ---------------------------------------
# Every input row gains a cluster id and confidence score up front, plus
# one 'canonical_*' column per canonicalized field at the end; records that
# belong to no cluster get a unique singleton id and blank extras.
with open(output_file, 'w') as f_output:
    writer = csv.writer(f_output)
    with open(input_file) as f_input :
        reader = csv.reader(f_input)
        # next(reader) works on Python 2 and 3; reader.next() is 2-only.
        heading_row = next(reader)
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        # Derive the canonical field names from any clustered record rather
        # than relying on `canonical_rep` leaking out of the clustering
        # loop, which raises NameError when there are zero clusters.
        if cluster_membership:
            any_rep = next(iter(cluster_membership.values()))
            canonical_keys = list(any_rep["canonical representation"].keys())
        else:
            canonical_keys = []
        for key in canonical_keys:
            heading_row.append('canonical_' + key)
        writer.writerow(heading_row)
        for row in reader:
            row_id = int(row[0])
            if row_id in cluster_membership :
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                # NOTE(review): .encode('utf8') yields bytes -- correct for
                # the Python 2 csv writer this script targets; drop the
                # encode if migrating to Python 3.
                for key in canonical_keys:
                    row.append(canonical_rep[key].encode('utf8'))
            else:
                # Not a duplicate of anything: unique id, empty extras.
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)
Specifically I am getting the following when I run it:
C:\Anaconda\lib\site-packages\dedupe\core.py:18: UserWarning: There may be duplicates in the sample
warnings.warn("There may be duplicates in the sample")
Traceback (most recent call last):
File "<ipython-input-1-33e46d604c5f>", line 1, in <module>
runfile('H:/My Documents/Python Scripts/Dedupe/dupetestscript.py', wdir='H:/My Documents/Python Scripts/Dedupe')
File "C:\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 580, in runfile
execfile(filename, namespace)
File "H:/My Documents/Python Scripts/Dedupe/dupetestscript.py", line 67, in <module>
deduper.sample(data_d, 200)
File "C:\Anaconda\lib\site-packages\dedupe\api.py", line 924, in sample
random_sample_size))
TypeError: unhashable type: 'numpy.ndarray'
A numpy array can be changed (it is "mutable"). Python speeds up dictionary access by using the hash value of the key instead of the key.
So only hashable objects like numbers, strings or tuples can be used as keys in a dictionary. From the Python glossary definition of hashable: