bjoern/videoanalyse/post_processing.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128

#!/usr/bin/env python3

import argparse
from pathlib import Path
import numpy as np
import pandas as pd
import Levenshtein
import csv
from itertools import pairwise
from sklearn.metrics import pairwise_distances
from sklearn.cluster import DBSCAN
from pprint import pprint

# Command-line interface: a single positional argument, the directory that
# holds metrics.csv.
argparser = argparse.ArgumentParser(description="Distance evaluation")
argparser.add_argument("vp_dir", help="Directory containing metrics.csv")

# NOTE: parsing happens at module level, i.e. as soon as this file is executed.
args = argparser.parse_args()

# Directory passed on the command line, wrapped as a Path for "/" joins.
data_path = Path(args.vp_dir)


# def insertion_cost(char):
#     return 1.0


# def deletion_cost(char):
#     return 1.0


# def substitution_cost(char_a, char_b):
#     if char_a == "t" and char_b == "r":
#         return 0.5
#     return 1.0


# weighted_levenshtein = WeightedLevenshtein(
#     substitution_cost_fn=substitution_cost,
#     insertion_cost_fn=insertion_cost,
#     deletion_cost_fn=deletion_cost,
# )

# Maximum Levenshtein distance (inclusive) at which take_similar() still
# treats two URLs as the "same" URL.
dist_threshold = 5


# Function to return all elements in candidates that are similar to original
def take_similar(original, candidates):
    """Return the candidates that lie within ``dist_threshold`` of ``original``.

    Both arguments are printed first. A candidate is kept when its
    Levenshtein distance to ``original`` is less than or equal to the
    module-level ``dist_threshold``; the input order is preserved.
    """
    print(original)
    print(candidates)
    similar = []
    for candidate in candidates:
        if Levenshtein.distance(original, candidate) <= dist_threshold:
            similar.append(candidate)
    return similar


# Read metrics.csv
# with open(data_path / "metrics.csv", "r") as csvfile:
#     reader = csv.reader(csvfile, quotechar='"')
#     print(next(reader))
#


# Load metrics.csv and replace missing values with empty strings.
df = pd.read_csv(data_path / "metrics.csv").fillna("")


# The "url" column as two separate lists with the same contents.
all_urls = list(df["url"].values)
urls = all_urls.copy()


def group_urls(urls, *, eps=10, min_samples=5):
    """Cluster URLs with DBSCAN on their pairwise Levenshtein distances.

    The (url, label) pairs are pretty-printed before returning.

    Args:
        urls: Sequence of URL strings; duplicates are allowed and each entry
            receives its own label.
        eps: DBSCAN ``eps`` parameter, applied to the Levenshtein distance.
        min_samples: DBSCAN ``min_samples`` parameter.

    Returns:
        Array of cluster labels, one per entry of ``urls`` in input order.
        An empty array is returned for empty input.
    """
    # len() rather than truthiness so that NumPy arrays are accepted as well.
    if len(urls) == 0:
        return np.empty(0, dtype=np.intp)

    # inverse[i] is the position of urls[i] inside the sorted unique_urls.
    unique_urls, inverse = np.unique(urls, return_inverse=True)
    inverse = inverse.reshape(-1)

    def levenshtein_from_idx(idx1, idx2):
        # pairwise_distances passes each sample as a 1-element row; index it
        # explicitly, because int() on a non-0-d array is deprecated in NumPy.
        return Levenshtein.distance(
            unique_urls[int(idx1[0])], unique_urls[int(idx2[0])]
        )

    # Compute the expensive string distances once per pair of *unique* URLs ...
    unique_indices = np.arange(len(unique_urls)).reshape(-1, 1)
    unique_distances = pairwise_distances(
        X=unique_indices, Y=None, metric=levenshtein_from_idx, n_jobs=-1
    )
    # ... then expand to the full matrix over every input row.
    distance_matrix = unique_distances[np.ix_(inverse, inverse)]

    db = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed").fit(
        distance_matrix
    )
    labels = db.labels_
    pprint(list(zip(urls, labels)))
    return labels

# Cluster the URLs read from metrics.csv and print the labels returned by
# group_urls() as a plain list.
labels = group_urls(urls)
print(list(labels))


# urls = [[0, "Start"]]
# for url in all_urls:
#     if len(url[1]) > 0:
#         urls.append([float(url[0]), url[1]])


# Iterate over the list of all urls, putting similar ones into a group and removing
# them from the original list
# url_groups = []
# while len(all_urls) > 0:
#     group = take_similar(all_urls[0], all_urls)
#     url_groups.append(group)
#     for url in group:
#         all_urls.remove(url)

# # for every row check which group its url belongs to and add a column with group indices
# # also add columns with longest/most frequent url in group
# with open (data_path / "metrics.csv", "r") as input_file, \
#     open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
#     csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
#     csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
#     header = next(csv_reader)
#     header.extend(["group_index","longest","longest-distance","most_frequent","most_frequent-distance"])
#     csv_writer.writerow(header)
#     for row in csv_reader:
#         for idx, grp in enumerate(url_groups):
#             if row[3] in grp:
#                 row.append(idx)
#                 longest_in_grp = max(grp, key=len)
#                 row.append(longest_in_grp)
#                 row.append(Levenshtein.distance(row[6], longest_in_grp))
#                 most_frequent_in_grp = max(set(grp), key=grp.count)
#                 row.append(most_frequent_in_grp)
#                 row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
#         csv_writer.writerow(row)