diff --git a/neofuzz/process.py b/neofuzz/process.py index c049575..d91cadb 100644 --- a/neofuzz/process.py +++ b/neofuzz/process.py @@ -76,7 +76,8 @@ def index(self, options: Iterable[str]): n_dimensions = vectors.shape[1] self.nearest_neighbours = AnnoyIndex(n_dimensions, self.metric) for i_option, vector in enumerate(vectors): - self.nearest_neighbours.add_item(i_option, vector) + array = vector.toarray()[0,:] + self.nearest_neighbours.add_item(i_option, array) self.nearest_neighbours.build(self.n_trees, n_jobs=self.n_jobs) def query( @@ -115,8 +116,9 @@ def query( indices = [] distances = [] for query_vector in search_matrix: + query_array = query_vector.toarray()[0,:] ind, dist = self.nearest_neighbours.get_nns_by_vector( - query_vector, limit, include_distances=True + query_array, limit, include_distances=True ) indices.append(ind) distances.append(dist) @@ -236,7 +238,11 @@ def ratio(self, s1: str, s2: str) -> int: " please index before getting ratios." ) v1, v2 = self.vectorizer.transform([s1, s2]) - distance = pairwise_distances(v1, v2, metric=self.metric) + if self.metric == 'angular': + metric = 'cosine' + else: + metric = self.metric + distance = pairwise_distances(v1, v2, metric=metric) distance = np.ravel(distance)[0] score = (1 - distance) * 100 return int(score) @@ -288,7 +294,7 @@ def from_disk(cls, save_dir: Union[str, Path]): def char_ngram_process( ngram_range: Tuple[int, int] = (1, 5), tf_idf: bool = True, - metric: str = "cosine", + metric: str = "angular", refine_levenshtein: bool = False, ) -> Process: """Basic character n-gram based fuzzy search process. @@ -300,7 +306,7 @@ def char_ngram_process( n-grams. tf_idf: bool, default True Flag signifying whether the features should be tf-idf weighted. - metric: str, default 'cosine' + metric: str, default 'angular' Distance metric to use for fuzzy search. refine_levenshtein: bool, default None Indicates whether results should be refined with Levenshtein distance