#!/usr/bin/env python3
import argparse
from pathlib import Path

import numpy as np
import pandas as pd
import Levenshtein
import csv
from itertools import pairwise
from sklearn.metrics import pairwise_distances
from sklearn.cluster import DBSCAN
from pprint import pprint

argparser = argparse.ArgumentParser(description="Distance evaluation")
argparser.add_argument("vp_dir", help="Directory containing metrics.csv")
args = argparser.parse_args()
data_path = Path(args.vp_dir)

# def insertion_cost(char):
#     return 1.0

# def deletion_cost(char):
#     return 1.0

# def substitution_cost(char_a, char_b):
#     if char_a == "t" and char_b == "r":
#         return 0.5
#     return 1.0

# weighted_levenshtein = WeightedLevenshtein(
#     substitution_cost_fn=substitution_cost,
#     insertion_cost_fn=insertion_cost,
#     deletion_cost_fn=deletion_cost,
# )

# Distance threshold below which two urls are considered the "same"
dist_threshold = 5


# Return all elements of candidates within dist_threshold of original
def take_similar(original, candidates):
    print(original)
    print(candidates)
    result = [
        x for x in candidates if dist_threshold >= Levenshtein.distance(original, x)
    ]
    return result


# Read metrics.csv
# with open(data_path / "metrics.csv", "r") as csvfile:
#     reader = csv.reader(csvfile, quotechar='"')
#     print(next(reader))

df = pd.read_csv(data_path / "metrics.csv")
df = df.fillna("")

# Lists with only the urls
all_urls = list(df["url"].values)
urls = list(df["url"].values)


def group_urls(urls):
    """Cluster urls with DBSCAN on a precomputed Levenshtein distance matrix."""
    unique_urls = np.unique(urls)

    def levenshtein_from_idx(idx1, idx2):
        # pairwise_distances passes the metric single-element float arrays,
        # so take element [0] before casting to int (avoids the NumPy
        # scalar-conversion deprecation warning).
        return Levenshtein.distance(
            unique_urls[int(idx1[0])], unique_urls[int(idx2[0])]
        )

    # Encode each url as its index into unique_urls: one sample per url,
    # one "feature" holding the index the metric uses to look the url up.
    X = np.searchsorted(unique_urls, urls).reshape(-1, 1)
    distance_matrix = pairwise_distances(
        X=X, Y=None, metric=levenshtein_from_idx, n_jobs=-1
    )
    # TODO: tune eps and min_samples
    db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)
    labels = db.labels_
    pprint(list(zip(urls, labels)))
    return labels


labels = group_urls(urls)
print(list(labels))

# urls = [[0, "Start"]]
# for url in all_urls:
#     if len(url[1]) > 0:
#         urls.append([float(url[0]), url[1]])

# Iterate over the list of all urls, putting similar ones into a group and
# removing them from the original list
# url_groups = []
# while len(all_urls) > 0:
#     group = take_similar(all_urls[0], all_urls)
#     url_groups.append(group)
#     for url in group:
#         all_urls.remove(url)

# For every row, check which group its url belongs to and add a column with the
# group index; also add columns with the longest/most frequent url in the group
# with open(data_path / "metrics.csv", "r") as input_file, \
#         open(data_path / "metrics_grps.csv", "w", newline="") as output_file:
#     csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
#     csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
#     header = next(csv_reader)
#     header.extend(["group_index", "longest", "longest-distance",
#                    "most_frequent", "most_frequent-distance"])
#     csv_writer.writerow(header)
#     for row in csv_reader:
#         for idx, grp in enumerate(url_groups):
#             if row[3] in grp:
#                 row.append(idx)
#                 longest_in_grp = max(grp, key=len)
#                 row.append(longest_in_grp)
#                 row.append(Levenshtein.distance(row[6], longest_in_grp))
#                 most_frequent_in_grp = max(set(grp), key=grp.count)
#                 row.append(most_frequent_in_grp)
#                 row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
#         csv_writer.writerow(row)
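
# Hedged sketch (not part of the original pipeline): how the commented-out
# CSV-writing step above could be redone with pandas using the DBSCAN labels
# instead of the take_similar() groups. The column name "group" and the output
# file name "metrics_grps.csv" are assumptions carried over from the comments
# above; uncomment and adapt before use.
#
# df["group"] = labels
#
# def annotate(grp):
#     # Longest and most frequent url in this cluster, plus each row's
#     # Levenshtein distance to them.
#     longest = max(grp["url"], key=len)
#     most_frequent = grp["url"].mode().iloc[0]
#     grp["longest"] = longest
#     grp["longest-distance"] = grp["url"].map(
#         lambda u: Levenshtein.distance(u, longest)
#     )
#     grp["most_frequent"] = most_frequent
#     grp["most_frequent-distance"] = grp["url"].map(
#         lambda u: Levenshtein.distance(u, most_frequent)
#     )
#     return grp
#
# df = df.groupby("group", group_keys=False).apply(annotate)
# df.to_csv(data_path / "metrics_grps.csv", index=False)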