#!/usr/bin/env python3
import argparse
import csv
from itertools import pairwise
from pathlib import Path

import Levenshtein
import pandas as pd

argparser = argparse.ArgumentParser(description="Distance evaluation")
argparser.add_argument("vp_dir", help="Directory containing metrics.csv")
args = argparser.parse_args()

data_path = Path(args.vp_dir)

# def insertion_cost(char):
#     return 1.0

# def deletion_cost(char):
#     return 1.0

# def substitution_cost(char_a, char_b):
#     if char_a == "t" and char_b == "r":
#         return 0.5
#     return 1.0

# weighted_levenshtein = WeightedLevenshtein(
#     substitution_cost_fn=substitution_cost,
#     insertion_cost_fn=insertion_cost,
#     deletion_cost_fn=deletion_cost,
# )

# Distance threshold under which two urls are considered the "same"
# (see the illustrative distances at the bottom of this file).
dist_threshold = 5


# Return all elements of candidates within dist_threshold of original.
def take_similar(original, candidates):
    return [
        x for x in candidates if Levenshtein.distance(original, x) <= dist_threshold
    ]


# Read metrics.csv
# with open(data_path / "metrics.csv", "r") as csvfile:
#     reader = csv.reader(csvfile, quotechar='"')
#     print(next(reader))

df = pd.read_csv(data_path / "metrics.csv")
df = df.fillna("")

# Two working copies of the url column: all_urls is consumed by the grouping
# loop below, urls keeps the original visit order for the pairwise pass.
all_urls = list(df["url"].values)
urls = list(df["url"].values)

# urls = [[0, "Start"]]
# for url in all_urls:
#     if len(url[1]) > 0:
#         urls.append([float(url[0]), url[1]])

# Group the urls: repeatedly take the first remaining url, collect everything
# similar to it into one group, and remove that group from the list. The 0 is
# a placeholder for the group's cumulative time, which is not computed yet.
url_groups = []
while len(all_urls) > 0:
    group = take_similar(all_urls[0], all_urls)
    url_groups.append([set(group), 0])
    for url in group:
        all_urls.remove(url)

# Walk the urls pairwise in visit order and keep only those that differ from
# their predecessor by more than the threshold, collapsing runs of
# near-identical urls into a single entry.
new_urls = urls[:1]  # the first url has no predecessor, so keep it
for prev, curr in pairwise(urls):
    if Levenshtein.distance(prev, curr) > dist_threshold:
        new_urls.append(curr)

with open(data_path / "grouping_post.csv", "w") as csvfile:
    writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(["url"])
    for url in new_urls:
        # Wrap the url in a list: writerow on a bare string would emit
        # one column per character.
        writer.writerow([url])

with open(data_path / "all_urls.txt", "w") as f:
    for group in url_groups:
        f.write("=== new group, cumulative_time: {}\n".format(group[1]))
        for url in group[0]:
            f.write(url)
            f.write("\n")
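
# A minimal sketch of how dist_threshold plays out, using hypothetical urls
# (illustrations only, not values from metrics.csv): a one-character path
# change stays within the threshold and is grouped, while appending a
# 15-character query string exceeds it.
#
#   Levenshtein.distance("https://example.com/page/1",
#                        "https://example.com/page/2")
#   # -> 1  (<= 5: grouped / collapsed)
#
#   Levenshtein.distance("https://example.com/page/1",
#                        "https://example.com/page/1?session=abc123")
#   # -> 15 (> 5: kept as a separate entry)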