summary |
shortlog |
changelog |
graph |
tags |
bookmarks |
branches |
files |
changeset |
file |
latest |
revisions |
annotate |
diff |
comparison |
raw |
help

minhashing.py

author | Vincent Michel <vincent.michel@logilab.fr> |

Tue, 15 Oct 2013 09:03:13 +0000 | |

changeset 309 | abb55b01503f |

parent 306 | cf1e78baf803 |

permissions | -rw-r--r-- |

[data] Add reference data, see #183444

# -*- coding:utf-8 -*- # copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved. # contact http://www.logilab.fr -- mailto:contact@logilab.fr # # This program is free software: you can redistribute it and/or modify it under # the terms of the GNU Lesser General Public License as published by the Free # Software Foundation, either version 2.1 of the License, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more # details. # # You should have received a copy of the GNU Lesser General Public License along # with this program. If not, see <http://www.gnu.org/licenses/>. import cPickle from random import randint from collections import defaultdict import numpy as np from scipy.optimize import bisect from nazca.normalize import iter_wordgrams def randomhashfunction(zr): """ Return a random hash function, mapping x in Z to ZR h:x -> ax + b mod R """ bound = max(zr - 1, 1) a = randint(1, bound) b = randint(1, bound) def hashfunc(x): return ((a*x + b)%zr) return hashfunc class Minlsh(object): """ Operate minhashing + locally-sensitive-hashing to find similar sentences """ def __init__(self, verbose=False): self._trained = False self.sigmatrix = None self._verbose = verbose def train(self, sentences, k=2, siglen=200): """ Train the minlsh on the given sentences. - `k` is the length of the k-wordgrams used (the lower k is, the faster is the training) - `siglen` the length of the sentences signature """ rows, shape = self._buildmatrixdocument(sentences, k) if self._verbose: print "Training is done. Wait while signaturing" self._computesignaturematrix(rows, shape, siglen) self._trained = True def _buildmatrixdocument(self, sentences, k): """ Return a sparse matrix where : - Each sentence is a column - Each row is a element of the universal set Each value (r, c) is set to 1 if the element at row r is in the sentence c, 0 otherwise """ rows, universe, sizeofuniverse = [], {}, 0 for nb, sent in enumerate(sentences): row = [] for w in iter_wordgrams(sent, k): row.append(universe.setdefault(w, sizeofuniverse)) if row[-1] == sizeofuniverse: sizeofuniverse += 1 rows.append(row) if self._verbose and nb % 50000 == 0: print nb return rows, (len(rows), sizeofuniverse) def _computesignaturematrix(self, rows, shape, siglen): """ Return a matrix where each column is the signature the document The signature is composed of `siglen` numbers The more the documents have rows in commun, the closer they are. """ nrows, ncols = shape sig = np.empty((siglen, nrows)) #Generate the random hash functions hashfunc = [randomhashfunction(ncols) for _ in xrange(siglen)] #Compute hashing values just for once. #Avoid multiple recomputations for the same column. hashvalues = np.array([[hashfunc[i](r) for r in xrange(ncols)] for i in xrange(siglen)]) docind = 0 while rows: doc = rows.pop(0) #Concatenate the needed rows. tmp = np.dstack([hashvalues[:, r] for r in doc]) #Take the mininum of hashes sig[:, docind] = np.min(tmp[0], 1) docind += 1 if self._verbose and docind % 50000 == 0: print (docind * 100) / nrows self.sigmatrix = sig def save(self, savefile): """ Save the training into `savefile` for a future use """ if not self._trained: print "Not trained, nothing to save" return with open(savefile, 'wb') as fobj: pickler = cPickle.Pickler(fobj) pickler.dump(self.sigmatrix) def load(self, savefile): """ Load a trained minhashing """ with open(savefile, 'rb') as fobj: pickler = cPickle.Unpickler(fobj) self.sigmatrix = pickler.load() if self.sigmatrix is not None: self._trained = True else: self._trained = False def computebandsize(self, threshold, nbrows): """ Compute the bandsize according to the threshold given """ ### t ~ (1/b)^(1/r), where t is the threshold, b the number of ### bands, and r the number of rows per band. And nbrows (the length ### of the matrix is nbrows = b*r, so t ~ (r/L)^(1/r). So, let's ### find the root of f(x) = (x/L)^(1/r) - t. def f(x): y = pow(x/nbrows, 1. /x) - threshold return y ## Solve f(x) = 0, with x having values in [1, nbrows] return int(bisect(f, 1, nbrows)) def predict(self, threshold): """ Return a set of tuples of *possible* similar sentences """ if not self._trained: print "Train it before" return if not (0 < threshold <= 1): print "Threshold must be in ]0 ; 1]" return sig = self.sigmatrix # Treshold is a percent of similarity # It should be inverted here (0 is closed, 1 is far) threshold = 1 - threshold bandsize = self.computebandsize(threshold, self.sigmatrix.shape[0]) buckets = defaultdict(set) similars = set() for r in xrange(0, sig.shape[0], bandsize): buckets.clear() for i in xrange(sig.shape[1]): buckets[tuple(sig[r:r+bandsize, i])].add(i) similars.update(set(tuple(v) for v in buckets.itervalues() if len(v) > 1)) return similars