Python - Trouble with Dedupe: TypeError: unhashable type: 'numpy.ndarray'


I'm having trouble getting dedupe to run. I am trying to use this library to remove duplicates from a huge set of addresses. Here is my code:

import collections
import csv
import logging
import optparse
import os
import re
from numpy import nan

import dedupe
from unidecode import unidecode

optp = optparse.OptionParser()
optp.add_option('-v', '--verbose', dest='verbose', action='count',
                help='Increase verbosity (specify multiple times for more)'
                )
(opts, args) = optp.parse_args()
log_level = logging.WARNING 
if opts.verbose == 1:
    log_level = logging.INFO
elif opts.verbose >= 2:
    log_level = logging.DEBUG
logging.getLogger().setLevel(log_level)

input_file = 'H:/My Documents/Python Scripts/Dedupe/DupeTester.csv'
output_file = 'csv_example_output.csv'
settings_file = 'csv_example_learned_settings'
training_file = 'csv_example_training.json'

def preProcess(column):
    """Normalize a single CSV field: decode, transliterate to ASCII,
    collapse whitespace, strip quotes and lowercase."""
    column = unidecode(column.decode('utf8'))
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column

def readData(filename):
    """Read the CSV into a dict of records keyed by the unnamed index
    column, pre-processing every field."""
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row[''])  # the unnamed first column holds the row index
            data_d[row_id] = dict(clean_row)

    return data_d


print 'importing data ...'
data_d = readData(input_file)

if os.path.exists(settings_file):
    print 'reading from', settings_file
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)

else:
    fields = [
        {"field" : "fulladdr", "type" : "Address"},
        {"field" : "zip", "type" : "ShortString"},
             ]

    deduper = dedupe.Dedupe(fields)

    deduper.sample(data_d, 200)

    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    print 'starting active labeling...'

    dedupe.consoleLabel(deduper)

    deduper.train()

    with open(training_file, 'w') as tf :
        deduper.writeTraining(tf)

    with open(settings_file, 'w') as sf :
        deduper.writeSettings(sf)

print 'blocking...'

threshold = deduper.threshold(data_d, recall_weight=2)

print 'clustering...'
clustered_dupes = deduper.match(data_d, threshold)

print '# duplicate sets', len(clustered_dupes)

cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores) :
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output:
    writer = csv.writer(f_output)

    with open(input_file) as f_input :
        reader = csv.reader(f_input)

        heading_row = reader.next()
        heading_row.insert(0, 'confidence_score')
        heading_row.insert(0, 'Cluster ID')
        canonical_keys = canonical_rep.keys()
        for key in canonical_keys:
            heading_row.append('canonical_' + key)

        writer.writerow(heading_row)

        for row in reader:
            row_id = int(row[0])
            if row_id in cluster_membership :
                cluster_id = cluster_membership[row_id]["cluster id"]
                canonical_rep = cluster_membership[row_id]["canonical representation"]
                row.insert(0, cluster_membership[row_id]['confidence'])
                row.insert(0, cluster_id)
                for key in canonical_keys:
                    row.append(canonical_rep[key].encode('utf8'))
            else:
                row.insert(0, None)
                row.insert(0, singleton_id)
                singleton_id += 1
                for key in canonical_keys:
                    row.append(None)
            writer.writerow(row)

Specifically, I get the following error when I run it:

C:\Anaconda\lib\site-packages\dedupe\core.py:18: UserWarning: There may be duplicates in the sample
  warnings.warn("There may be duplicates in the sample")
Traceback (most recent call last):

  File "<ipython-input-1-33e46d604c5f>", line 1, in <module>
    runfile('H:/My Documents/Python Scripts/Dedupe/dupetestscript.py', wdir='H:/My Documents/Python Scripts/Dedupe')

  File "C:\Anaconda\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 580, in runfile
    execfile(filename, namespace)

  File "H:/My Documents/Python Scripts/Dedupe/dupetestscript.py", line 67, in <module>
    deduper.sample(data_d, 200)

  File "C:\Anaconda\lib\site-packages\dedupe\api.py", line 924, in sample
    random_sample_size))

TypeError: unhashable type: 'numpy.ndarray'

1 Answer

A numpy array can be changed after it is created (it is "mutable"). Python speeds up dictionary access by using the hash value of the key instead of the key itself.
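
A quick illustration of the difference (plain Python and numpy, nothing specific to dedupe):

import numpy as np

# Immutable built-ins have a hash value, so they work as dictionary keys.
print hash(42)
print hash('42 main st')
print hash((42, 'main st'))

# Mutable containers do not define a hash value; both of these raise
# "TypeError: unhashable type".
try:
    hash(['42', 'main st'])
except TypeError as err:
    print err   # unhashable type: 'list'

try:
    hash(np.array(['42', 'main st']))
except TypeError as err:
    print err   # unhashable type: 'numpy.ndarray'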

So only hashable objects such as numbers, strings, or tuples can be used as keys in a dictionary. From the Python glossary definition of hashable:

An object is hashable if it has a hash value which never changes during its lifetime (it needs a __hash__() method), and can be compared to other objects (it needs an __eq__() method). Hashable objects which compare equal must have the same hash value.

Hashability makes an object usable as a dictionary key and a set member, because these data structures use the hash value internally.

All of Python’s immutable built-in objects are hashable, while no mutable containers (such as lists or dictionaries) are. Objects which are instances of user-defined classes are hashable by default; they all compare unequal (except with themselves), and their hash value is derived from their id().
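
Applied to this error: the traceback suggests that somewhere in the data handed to deduper.sample, a value that dedupe tries to use as a dictionary key or set member is a numpy array rather than a plain string or number (this can happen, for example, if the CSV was pre-processed with pandas). Here is a minimal sketch of the failure and one way around it; the variable names are made up for illustration:

import numpy as np

record_value = np.array(['123 main st', '60616'])

# Using the array directly as a dictionary key (or putting it in a set)
# reproduces the error from the traceback.
try:
    seen = {record_value: 1}
except TypeError as err:
    print err   # unhashable type: 'numpy.ndarray'

# Converting to an immutable type first makes it hashable again.
seen = {tuple(record_value): 1}   # a tuple of strings works
seen[str(record_value[0])] = 2    # so does a plain string
print seen

In the question's script, that usually means making sure every record id and field value stored in data_d is a plain Python str or int before it reaches deduper.sample.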