Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

csv_example.py - program terminated without error message #128

Open
surianisha opened this issue Jun 10, 2022 · 1 comment
Open

csv_example.py - program terminated without error message #128

surianisha opened this issue Jun 10, 2022 · 1 comment

Comments

@surianisha
Copy link

I couldn't find 'csv_example_training.json' in the repo, so I used 'csv_input_with_true_ids.csv' instead. There was no settings file either, so I couldn't use one (it is commented out in the code shared below).
Made sure to use consoleLabel() instead of console_label().

I followed the steps in csv_example.py. Active learning started, but the program then terminated without an error message.
Screen Shot 2022-06-09 at 7 59 19 PM

The code is below:
##################################################
import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

def preProcess(column):
    """
    Clean up one CSV cell so that trivially different values compare equal.

    The value is passed through unidecode, runs of two or more spaces are
    collapsed to one space, each newline is replaced by a space, and the
    result is stripped of surrounding whitespace and quote characters and
    lower-cased.

    :param column: the raw cell value (a string, or None for a missing cell)
    :return: the cleaned string, or None when the cell is missing or ends
        up empty after cleaning
    """
    # csv.DictReader pads short rows with None; treat that as a missing value
    # instead of handing None to unidecode.
    if column is None:
        return None

    column = unidecode(column)
    column = re.sub(r'  +', ' ', column)
    column = re.sub(r'\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()

    # An empty string is reported as None so callers see it as missing data.
    if not column:
        column = None
    return column

def readData(filename):
    """
    Read a CSV file into a dictionary of cleaned records.

    Every value of every row is run through preProcess. The integer value
    of the row's 'Id' column becomes the key of the returned dictionary.

    :param filename: path of the CSV file to read; it must have an 'Id'
        column holding integer values
    :return: dict mapping row id (int) to a dict of column name -> cleaned
        value
    """
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = {k: preProcess(v) for (k, v) in row.items()}
            # The id is taken from the raw row, not the cleaned one.
            row_id = int(row['Id'])
            data_d[row_id] = clean_row

    return data_d

# example

# Directory holding the example data; the trailing slash matters because
# file names are appended to it with plain string concatenation.
path = '/Users/asuri/Downloads/dedupe-examples-master/csv_example/'
# Name of the messy input CSV inside that directory.
filename = 'csv_example_messy_input.csv'

#######################################

if __name__ == '__main__':

    # Logging: each -v on the command line raises the verbosity one step
    # (none -> WARNING, -v -> INFO, -vv or more -> DEBUG).
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)'
                    )
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    input_file = path + filename
    output_file = path + 'output.csv'
    # No learned-settings file is read or written here, so the model is
    # trained again on every run.
    # The training file does not have to exist beforehand: it is written by
    # this script after labeling and reused on later runs. It must NOT point
    # at the ground-truth CSV, which is not labeled training data and would
    # be overwritten by write_training() below.
    training_file = path + 'csv_example_training.json'

    print('importing data ...')
    data_d = readData(input_file)

    # Fields dedupe compares; 'has missing' marks columns that may be empty.
    fields = [
        {'field': 'Site name', 'type': 'String'},
        {'field': 'Address', 'type': 'String'},
        {'field': 'Zip', 'type': 'Exact', 'has missing': True},
        {'field': 'Phone', 'type': 'String', 'has missing': True},
    ]

    deduper = dedupe.Dedupe(fields)

    # Seed active learning with earlier labels when a training file exists.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_d, f)
    else:
        deduper.prepare_training(data_d)

    print('starting active labeling...')

    # dedupe 2.x calls the labeler console_label(); 1.x called it
    # consoleLabel(). Use whichever the installed version provides.
    # NOTE(review): the labeler prompts on stdin. The traceback reported with
    # this script comes from Spyder's qtconsole input handling, so running
    # the script from a plain terminal is the likely workaround - confirm.
    label = getattr(dedupe, 'console_label', None)
    if label is None:
        label = dedupe.consoleLabel
    label(deduper)

    deduper.train()

    # Persist the labels so the next run can skip re-labeling.
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)

    print('clustering...')
    clustered_dupes = deduper.partition(data_d, 0.5)
    print('# duplicate sets', len(clustered_dupes))

    # Map every record id to its cluster id and confidence score.
    cluster_membership = {}
    for cluster_id, (records, scores) in enumerate(clustered_dupes):
        for record_id, score in zip(records, scores):
            cluster_membership[record_id] = {
                "Cluster ID": cluster_id,
                "confidence_score": score
            }

    # newline='' on the output file is what the csv module asks for, so the
    # writer controls line endings itself.
    with open(output_file, 'w', newline='') as f_output, \
            open(input_file) as f_input:

        reader = csv.DictReader(f_input)
        fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

        writer = csv.DictWriter(f_output, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            # Same 'Id' column that readData uses as the record key.
            row_id = int(row['Id'])
            row.update(cluster_membership[row_id])
            writer.writerow(row)
@surianisha
Copy link
Author

Internal error details:

Traceback (most recent call last):
File "/opt/anaconda3/lib/python3.9/site-packages/qtconsole/base_frontend_mixin.py", line 138, in _dispatch
handler(msg)
File "/opt/anaconda3/lib/python3.9/site-packages/spyder/plugins/ipythonconsole/widgets/debugging.py", line 278, in _handle_input_request
return super(DebuggingWidget, self)._handle_input_request(msg)
File "/opt/anaconda3/lib/python3.9/site-packages/qtconsole/frontend_widget.py", line 512, in _handle_input_request
self._readline(msg['content']['prompt'], callback=callback, password=msg['content']['password'])
File "/opt/anaconda3/lib/python3.9/site-packages/qtconsole/console_widget.py", line 2422, in _readline
self._show_prompt(prompt, newline=False, separator=False)
TypeError: _show_prompt() got an unexpected keyword argument 'separator'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant