#!/usr/bin/env python3
import argparse
from pathlib import Path
import pandas as pd
import Levenshtein
import csv
from itertools import pairwise
argparser = argparse.ArgumentParser(
    description="Distance evaluation"
)
argparser.add_argument(
    "vp_dir", help="Directory containing metrics.csv"
)
args = argparser.parse_args()
data_path = Path(args.vp_dir)
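# Example invocation (the script filename and directory layout are assumptions,
# not prescribed by the script itself):
#   python3 distance_eval.py results/run1
# where results/run1/ contains the metrics.csv to evaluate.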
# def insertion_cost(char):
#     return 1.0
# def deletion_cost(char):
#     return 1.0
# def substitution_cost(char_a, char_b):
#     if char_a == "t" and char_b == "r":
#         return 0.5
#     return 1.0
# weighted_levenshtein = WeightedLevenshtein(
#     substitution_cost_fn=substitution_cost,
#     insertion_cost_fn=insertion_cost,
#     deletion_cost_fn=deletion_cost,
# )
# Distance threshold to define "same" url
dist_threshold = 5
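# Example: "item?id=1" and "item?id=2" (hypothetical urls) differ by a single
# character, i.e. edit distance 1 <= 5, so they count as the "same" url here.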
# Return all elements in candidates that are within dist_threshold of original
def take_similar(original, candidates):
    print(original)
    print(candidates)
    result = [
        x
        for x in candidates
        if dist_threshold >= Levenshtein.distance(original, x)
    ]
    return result
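# Usage sketch (hypothetical values): with dist_threshold = 5,
#   take_similar("https://example.com/a", ["https://example.com/b", "https://other.org"])
# returns ["https://example.com/b"], since only that candidate is within edit
# distance 5 of the original.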
# Read metrics.csv (commented-out variant using the csv module kept for reference)
# with open(data_path / "metrics.csv", "r") as csvfile:
#     reader = csv.reader(csvfile, quotechar='"')
#     print(next(reader))
#
df = pd.read_csv(data_path / "metrics.csv")
df = df.fillna('')
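# Assumption: metrics.csv provides at least a "url" column; empty cells are
# normalised to "" above so Levenshtein.distance always receives strings.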
# Lists containing only the url column
all_urls = list(df["url"].values)
urls = list(df["url"].values)
# urls = [[0, "Start"]]
# for url in all_urls:
#     if len(url[1]) > 0:
#         urls.append([float(url[0]), url[1]])
# Iterate over the list of all urls, putting similar ones into a group and
# removing them from the original list
url_groups = []
while len(all_urls) > 0:
    group = take_similar(all_urls[0], all_urls)
    url_groups.append([set(group), 0])
    for url in group:
        all_urls.remove(url)
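# Illustrative outcome (hypothetical urls): with dist_threshold = 5,
# ["/home", "/home/", "/home?x=1"] are all within edit distance 5 of "/home"
# and end up in a single entry [{"/home", "/home/", "/home?x=1"}, 0], where the
# second element is the (not yet accumulated) time for the group.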
# Iterate over the urls pairwise, keeping a url only when it differs from its
# predecessor by more than the distance threshold, i.e. collapse runs of
# similar consecutive urls. Cumulative times per url group are not computed
# yet; cum_times is only a placeholder.
new_urls = urls[:1]  # keep the first url so the collapsed sequence keeps its start
cum_times = []
for pair in pairwise(urls):
    print(pair)
    dist = Levenshtein.distance(pair[0], pair[1])
    if dist > dist_threshold:
        new_urls.append(pair[1])
with open(data_path / "grouping_post.csv", "w") as csvfile:
writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
writer.writerow(["url"])
for line in new_urls:
writer.writerow(line)
with open(data_path / "all_urls.txt", "w") as f:
for group in url_groups:
f.write("=== new group, cumulative_time: {}\n".format(group[1]))
for url in group[0]:
f.write(url)
f.write("\n")
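# Sketch of the resulting all_urls.txt (hypothetical urls; the time field stays
# 0 because cumulative times are not filled in yet):
#   === new group, cumulative_time: 0
#   /home
#   /home?x=1
#   === new group, cumulative_time: 0
#   /settings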