#!/usr/bin/env python3
import argparse
import csv
from itertools import pairwise
from pathlib import Path

import Levenshtein
import pandas as pd

argparser = argparse.ArgumentParser(description="Distance evaluation")
argparser.add_argument("vp_dir", help="Directory containing metrics.csv")
args = argparser.parse_args()

data_path = Path(args.vp_dir)

# def insertion_cost(char):
#     return 1.0

# def deletion_cost(char):
#     return 1.0

# def substitution_cost(char_a, char_b):
#     if char_a == "t" and char_b == "r":
#         return 0.5
#     return 1.0

# weighted_levenshtein = WeightedLevenshtein(
#     substitution_cost_fn=substitution_cost,
#     insertion_cost_fn=insertion_cost,
#     deletion_cost_fn=deletion_cost,
# )

# Distance threshold under which two urls are considered the "same"
# (see the illustrative distances at the bottom of this file).
dist_threshold = 5


# Return all elements of candidates within dist_threshold of original.
def take_similar(original, candidates):
    return [
        x for x in candidates if Levenshtein.distance(original, x) <= dist_threshold
    ]


# Read metrics.csv
# with open(data_path / "metrics.csv", "r") as csvfile:
#     reader = csv.reader(csvfile, quotechar='"')
#     print(next(reader))

df = pd.read_csv(data_path / "metrics.csv")
df = df.fillna("")

# Two working copies of the url column: all_urls is consumed by the grouping
# loop below, urls keeps the original visit order for the pairwise pass.
all_urls = list(df["url"].values)
urls = list(df["url"].values)

# urls = [[0, "Start"]]
# for url in all_urls:
#     if len(url[1]) > 0:
#         urls.append([float(url[0]), url[1]])

# Group the urls: repeatedly take the first remaining url, collect everything
# similar to it into one group, and remove that group from the list. The 0 is
# a placeholder for the group's cumulative time, which is not computed yet.
url_groups = []
while len(all_urls) > 0:
    group = take_similar(all_urls[0], all_urls)
    url_groups.append([set(group), 0])
    for url in group:
        all_urls.remove(url)

# Walk the urls pairwise in visit order and keep only those that differ from
# their predecessor by more than the threshold, collapsing runs of
# near-identical urls into a single entry.
new_urls = urls[:1]  # the first url has no predecessor, so keep it
for prev, curr in pairwise(urls):
    if Levenshtein.distance(prev, curr) > dist_threshold:
        new_urls.append(curr)

with open(data_path / "grouping_post.csv", "w") as csvfile:
    writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(["url"])
    for url in new_urls:
        # Wrap the url in a list: writerow on a bare string would emit
        # one column per character.
        writer.writerow([url])

with open(data_path / "all_urls.txt", "w") as f:
    for group in url_groups:
        f.write("=== new group, cumulative_time: {}\n".format(group[1]))
        for url in group[0]:
            f.write(url)
            f.write("\n")
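
# A minimal sketch of how dist_threshold plays out, using hypothetical urls
# (illustrations only, not values from metrics.csv): a one-character path
# change stays within the threshold and is grouped, while appending a
# 15-character query string exceeds it.
#
#   Levenshtein.distance("https://example.com/page/1",
#                        "https://example.com/page/2")
#   # -> 1  (<= 5: grouped / collapsed)
#
#   Levenshtein.distance("https://example.com/page/1",
#                        "https://example.com/page/1?session=abc123")
#   # -> 15 (> 5: kept as a separate entry)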