#!/usr/bin/env python3
import argparse
import csv
from pathlib import Path

import Levenshtein
import pandas as pd

argparser = argparse.ArgumentParser(description="Distance evaluation")
argparser.add_argument("vp_dir", help="Directory containing metrics.csv")
args = argparser.parse_args()

data_path = Path(args.vp_dir)

# def insertion_cost(char):
#     return 1.0

# def deletion_cost(char):
#     return 1.0

# def substitution_cost(char_a, char_b):
#     if char_a == "t" and char_b == "r":
#         return 0.5
#     return 1.0

# weighted_levenshtein = WeightedLevenshtein(
#     substitution_cost_fn=substitution_cost,
#     insertion_cost_fn=insertion_cost,
#     deletion_cost_fn=deletion_cost,
# )

# Distance threshold to define the "same" url
dist_threshold = 5


# Return all elements of candidates whose Levenshtein distance to original
# is at most dist_threshold
def take_similar(original, candidates):
    return [
        x for x in candidates
        if Levenshtein.distance(original, x) <= dist_threshold
    ]


# Read metrics.csv
# with open(data_path / "metrics.csv", "r") as csvfile:
#     reader = csv.reader(csvfile, quotechar='"')
#     print(next(reader))

df = pd.read_csv(data_path / "metrics.csv")
df = df.fillna("")

# List with only urls
all_urls = list(df["url"].values)
urls = list(df["url"].values)

# urls = [[0, "Start"]]
# for url in all_urls:
#     if len(url[1]) > 0:
#         urls.append([float(url[0]), url[1]])

# Iterate over the list of all urls, putting similar ones into a group and
# removing them from the original list
url_groups = []
while len(all_urls) > 0:
    group = take_similar(all_urls[0], all_urls)
    url_groups.append(group)
    for url in group:
        all_urls.remove(url)

# For every row, check which group its url belongs to and add a column with the
# group index; also add columns with the longest and most frequent url in that group
with open(data_path / "metrics.csv", "r") as input_file, \
        open(data_path / "metrics_grps.csv", "w", newline="") as output_file:
    csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
    csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

    header = next(csv_reader)
    url_idx = header.index("url")  # look up the url column instead of hard-coding its position
    header.extend(["group_index", "longest", "most frequent"])
    csv_writer.writerow(header)

    for row in csv_reader:
        for idx, grp in enumerate(url_groups):
            if row[url_idx] in grp:
                row.append(idx)
                row.append(max(grp, key=len))             # longest url in the group
                row.append(max(set(grp), key=grp.count))  # most frequent url in the group
                break  # groups are disjoint, so stop after the first match
        csv_writer.writerow(row)
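
# Example invocation (a sketch; the script filename and directory below are
# hypothetical, assuming metrics.csv in that directory has a quoted "url" column):
#
#   python3 group_urls.py results/run_01
#
# This reads results/run_01/metrics.csv and writes results/run_01/metrics_grps.csv
# with three extra columns per row: group_index, longest, and "most frequent",
# where rows whose urls are within Levenshtein distance 5 of each other share a group.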