Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions neofuzz/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ def index(self, options: Iterable[str]):
n_dimensions = vectors.shape[1]
self.nearest_neighbours = AnnoyIndex(n_dimensions, self.metric)
for i_option, vector in enumerate(vectors):
self.nearest_neighbours.add_item(i_option, vector)
array = vector.toarray()[0,:]
self.nearest_neighbours.add_item(i_option, array)
self.nearest_neighbours.build(self.n_trees, n_jobs=self.n_jobs)

def query(
Expand Down Expand Up @@ -115,8 +116,9 @@ def query(
indices = []
distances = []
for query_vector in search_matrix:
query_array = query_vector.toarray()[0,:]
ind, dist = self.nearest_neighbours.get_nns_by_vector(
query_vector, limit, include_distances=True
query_array, limit, include_distances=True
)
indices.append(ind)
distances.append(dist)
Expand Down Expand Up @@ -236,7 +238,11 @@ def ratio(self, s1: str, s2: str) -> int:
" please index before getting ratios."
)
v1, v2 = self.vectorizer.transform([s1, s2])
distance = pairwise_distances(v1, v2, metric=self.metric)
if self.metric == 'angular':
metric = 'cosine'
else:
metric = self.metric
distance = pairwise_distances(v1, v2, metric=metric)
distance = np.ravel(distance)[0]
score = (1 - distance) * 100
return int(score)
Expand Down Expand Up @@ -288,7 +294,7 @@ def from_disk(cls, save_dir: Union[str, Path]):
def char_ngram_process(
ngram_range: Tuple[int, int] = (1, 5),
tf_idf: bool = True,
metric: str = "cosine",
metric: str = "angular",
refine_levenshtein: bool = False,
) -> Process:
"""Basic character n-gram based fuzzy search process.
Expand All @@ -300,7 +306,7 @@ def char_ngram_process(
n-grams.
tf_idf: bool, default True
Flag signifying whether the features should be tf-idf weighted.
metric: str, default 'cosine'
metric: str, default 'angular'
Distance metric to use for fuzzy search.
refine_levenshtein: bool, default None
Indicates whether results should be refined with Levenshtein distance
Expand Down