How to use the dedupe.blocking.Blocker class in dedupe

To help you get started, we’ve selected a few dedupe examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dedupeio / dedupe / tests / test_blocking.py View on Github external
def test_unconstrained_inverted_index(self):
    """A TfidfTextSearchPredicate with threshold 0.0 should place every
    pair of records with matching indexed names into a common block.

    Expects exactly the two duplicate name pairs in the fixture
    (ids 120/125 and 130/135) to form multi-record blocks.
    """
    predicate = dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")
    blocker = dedupe.blocking.Blocker([predicate])

    # Index every distinct name appearing in the fixture data.
    unique_names = {record["name"] for record in viewvalues(self.data_d)}
    blocker.index(unique_names, "name")

    keyed_blocks = defaultdict(set)
    for block_key, record_id in blocker(self.data_d.items()):
        keyed_blocks[block_key].add(record_id)

    # Only blocks holding more than one record are candidate pairs.
    multi_record_blocks = {frozenset(members)
                           for members in keyed_blocks.values()
                           if len(members) > 1}

    assert multi_record_blocks == \
        {frozenset([120, 125]), frozenset([130, 135])}
github entrepreneur-interet-general / the-magical-csv-merge-machine / merge_machine / old_code / test_learning.py View on Github external
# NOTE(review): fragment of old exploratory code; the control flow above
# this point was cut off in extraction, so this `assert False` appears to
# be a "should not reach here" sentinel — confirm against the original.
assert False


#candidates = [(source[k1], ref[k2])
#               for k1, k2
#               in blocked_sample_keys | random_sample_keys]

#candidates = [(source_items[k1], ref_items[k2])
#               for predicate, (k1, k2)
#               in blocked_sample_keys]


# Number of simple predicates combined into one compound predicate.
compound_length = 1

# `my_predicates` is defined elsewhere in the original module.
blocker = blocking.Blocker(my_predicates)

#blocker.indexAll({i : record
#                       for i, record
#                       in enumerate(unroll(candidates))})
#
#dupe_cover = cover(blocker, candidates, compound_length)
#

#
#tab = pd.DataFrame([[key, val] for key, val in dupe_cover_count.items()], columns=['predicate', 'count'])
#tab['col'] = tab.predicate.apply(get_col)
#tab.groupby('col')['count'].mean() # meaningful columns
def invert_dupe_cover(dupe_cover):
    inv_dupe_cover = defaultdict(set)
    for key, matches in dupe_cover.items():
github dedupeio / dedupe / tests / test_blocking.py View on Github external
def test_unconstrained_inverted_index(self):

        blocker = dedupe.blocking.Blocker(
            [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")])

        blocker.index(set(record["name"]
                          for record
                          in viewvalues(self.data_d)),
                      "name")

        blocks = defaultdict(set)

        for block_key, record_id in blocker(self.data_d.items()):
            blocks[block_key].add(record_id)

        blocks = set([frozenset(block) for block in blocks.values()
                      if len(block) > 1])

        assert blocks ==\
github dedupeio / dedupe / tests / test_blocking.py View on Github external
def test_dedupe_coverage(self) :
    predicates = self.data_model.predicates()
    blocker = dedupe.blocking.Blocker(predicates)
    dedupe.training.prepare_index(blocker, self.training_pairs, "Dedupe")

    coverage = dedupe.training.coveredBy(blocker.predicates, self.training)
    assert self.simple(coverage.keys()).issuperset(
          set(["SimplePredicate: (tokenFieldPredicate, name)", 
               "SimplePredicate: (commonSixGram, name)", 
               "TfidfTextCanopyPredicate: (0.4, name)", 
               "SimplePredicate: (sortedAcronym, name)",
               "SimplePredicate: (sameThreeCharStartPredicate, name)", 
               "TfidfTextCanopyPredicate: (0.2, name)", 
               "SimplePredicate: (sameFiveCharStartPredicate, name)", 
               "TfidfTextCanopyPredicate: (0.6, name)", 
               "SimplePredicate: (wholeFieldPredicate, name)", 
               "TfidfTextCanopyPredicate: (0.8, name)", 
               "SimplePredicate: (commonFourGram, name)", 
               "SimplePredicate: (firstTokenPredicate, name)",
github dedupeio / dedupe / tests / test_blocking.py View on Github external
def setUp(self):
    """Build a small two-dataset fixture plus a TF-IDF search blocker.

    Populates ``self.blocker`` with a single TfidfTextSearchPredicate on
    "name", ``self.records_1`` with the dataset-0 records, and
    ``self.fields_2`` with the names of the dataset-1 records.
    """
    data_d = {
      100 : {"name": "Bob", "age": "50", "dataset": 0},
      105 : {"name": "Charlie", "age": "75", "dataset": 1},
      110 : {"name": "Meredith", "age": "40", "dataset": 1},
      115 : {"name": "Sue", "age": "10", "dataset": 0},
      120 : {"name": "Jimbo", "age": "21","dataset": 0},
      125 : {"name": "Jimbo", "age": "21", "dataset": 0},
      130 : {"name": "Willy", "age": "35", "dataset": 0},
      135 : {"name": "Willy", "age": "35", "dataset": 1},
      140 : {"name": "Martha", "age": "19", "dataset": 1},
      145 : {"name": "Kyle", "age": "27", "dataset": 0},
    }

    predicate = dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")
    self.blocker = dedupe.blocking.Blocker([predicate])

    # Full records from dataset 0.
    self.records_1 = {record_id: record
                      for record_id, record in viewitems(data_d)
                      if record["dataset"] == 0}

    # Just the "name" field for dataset 1.
    self.fields_2 = {record_id: record["name"]
                     for record_id, record in viewitems(data_d)
                     if record["dataset"] == 1}
github dedupeio / dedupe / tests / test_training.py View on Github external
def test_dedupe_coverage(self):
        predicates = self.data_model.predicates()
        blocker = dedupe.blocking.Blocker(predicates)
        blocker.indexAll({i: x for i, x in enumerate(self.training_records)})
        coverage = training.Cover(blocker.predicates,
                                  self.training)
        assert self.simple(coverage.keys()).issuperset(
            set(["SimplePredicate: (tokenFieldPredicate, name)",
                 "SimplePredicate: (commonSixGram, name)",
                 "TfidfTextCanopyPredicate: (0.4, name)",
                 "SimplePredicate: (sortedAcronym, name)",
                 "SimplePredicate: (sameThreeCharStartPredicate, name)",
                 "TfidfTextCanopyPredicate: (0.2, name)",
                 "SimplePredicate: (sameFiveCharStartPredicate, name)",
                 "TfidfTextCanopyPredicate: (0.6, name)",
                 "SimplePredicate: (wholeFieldPredicate, name)",
                 "TfidfTextCanopyPredicate: (0.8, name)",
                 "SimplePredicate: (commonFourGram, name)",
                 "SimplePredicate: (firstTokenPredicate, name)",
github dedupeio / dedupe / dedupe / api.py View on Github external
def _trainBlocker(self, maximum_comparisons, recall, index_predicates):  # pragma: no cover
    """Learn blocking predicates from the labeled matches and install a
    Blocker built from them on this instance.

    Sets ``self.predicates`` to the learned predicate set and
    ``self.blocker`` to a ``blocking.Blocker`` over those predicates.
    """
    # Copy so the learner cannot mutate the stored training pairs.
    match_pairs = self.training_pairs['match'][:]

    candidate_predicates = self.data_model.predicates(index_predicates,
                                                      self.canopies)

    learner = self._blockLearner(candidate_predicates)
    self.predicates = learner.learn(match_pairs,
                                    maximum_comparisons,
                                    recall)

    self.blocker = blocking.Blocker(self.predicates)
github dedupeio / dedupe / dedupe / training.py View on Github external
def __init__(self, predicates, sampled_records, data):
    """Index the data, cover the sampled pairs with simple predicates,
    and estimate comparison counts for compound predicates.

    ``self.r`` scales pair counts observed in the sample up to the full
    data set; ``self.comparison_count`` maps compound predicates to their
    estimated number of comparisons.
    """
    compound_length = 2

    full_size = sampled_records.original_length
    sample_size = len(sampled_records)

    # Ratio of possible pairs in the full data to pairs in the sample.
    self.r = (full_size * (full_size - 1)) / (sample_size * (sample_size - 1))

    self.blocker = blocking.Blocker(predicates)
    self.blocker.indexAll(data)

    simple_cover = self.coveredPairs(self.blocker, sampled_records)
    compound_predicates = self.compound(simple_cover, compound_length)

    self.comparison_count = self.comparisons(compound_predicates,
                                             simple_cover)
github dedupeio / dedupe / dedupe / blocking.py View on Github external
def canopyOverlap(self,
                       tfidf_predicates,
                       record_pairs) :
        """Record which candidate pairs share a TF-IDF canopy for each
        (threshold, field) predicate.

        Mutates ``self.overlapping`` and ``self.blocks``: a pair is added
        under a predicate when both its records fall in the same canopy
        for that predicate's field.

        NOTE(review): Python 2 code (``itertools.izip``); ``threshold``
        appears to be a predicate object exposing ``__name__`` rather
        than a bare float — confirm against the original module.
        """

        # uniquify records, then assign each unique record a numeric id
        docs = list(set(itertools.chain(*record_pairs)))
        id_records = list(itertools.izip(itertools.count(), docs))
        record_ids = dict(itertools.izip(docs, itertools.count()))


        # Build canopies over the unique records for all TF-IDF predicates.
        blocker = Blocker()
        blocker.tfidf_predicates = tfidf_predicates
        blocker.tfIdfBlocks(id_records)

        for (threshold, field) in blocker.tfidf_predicates:
            canopy = blocker.canopies[threshold.__name__ + field]
            for record_1, record_2 in record_pairs :
                id_1 = record_ids[record_1]
                id_2 = record_ids[record_2]
                # Same canopy id => this predicate covers the pair.
                if canopy[id_1] == canopy[id_2]:
                    self.overlapping[(threshold, field)].add((record_1, record_2))
                    self.blocks[(threshold, field)][canopy[id_1]].add((record_1, record_2))