#!/usr/bin/env python3
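"""Group the URLs in metrics.csv by string similarity.

Pairwise Levenshtein distances between the URLs are clustered with DBSCAN,
and a copy of the file is written to metrics_grps.csv with each row's group
index, the longest and most frequent URL of its group, and the row URL's
Levenshtein distance to each appended as extra columns.
"""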
import argparse
import csv
from collections import defaultdict
from pathlib import Path

import Levenshtein
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances
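# Third-party dependencies: numpy, pandas, scikit-learn, and the
# `Levenshtein` module (on PyPI as `Levenshtein`, or via the older
# `python-Levenshtein` package).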
argparser = argparse.ArgumentParser(description="Distance evaluation")
argparser.add_argument("vp_dir", help="Directory containing metrics.csv")
args = argparser.parse_args()
data_path = Path(args.vp_dir)
# Read metrics.csv and replace missing values with empty strings.
df = pd.read_csv(data_path / "metrics.csv")
df = df.fillna("")
# List of every URL in the file (duplicates preserved).
urls = list(df["url"].values)
def group_urls(urls):
    """Cluster URLs into groups of similar strings via DBSCAN."""
    unique_urls = np.unique(urls)

    def levenshtein_from_idx(idx1, idx2):
        # pairwise_distances passes each sample as a 1-element array; index
        # into it before casting so NumPy's deprecated array-to-scalar
        # conversion is never triggered.
        return Levenshtein.distance(
            unique_urls[int(idx1[0])], unique_urls[int(idx2[0])]
        )

    # Represent each URL by its index into the sorted unique_urls array so
    # the metric callback can recover the strings; shape (n_samples, 1).
    X = np.searchsorted(unique_urls, urls).reshape(-1, 1)
    distance_matrix = pairwise_distances(
        X=X, Y=None, metric=levenshtein_from_idx, n_jobs=-1
    )
    # TODO: tune the eps and min_samples parameters
    db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)
    labels = db.labels_
    # Group URLs by cluster label, keeping duplicates so frequencies can be
    # computed later. DBSCAN labels noise points -1, so every unclustered
    # URL lands together in one catch-all group.
    grouped = defaultdict(list)
    for url, label in zip(urls, labels):
        grouped[label].append(url)
    return list(grouped.values())
url_groups = group_urls(urls)
print(len(url_groups))

# For every row, check which group its URL belongs to and append a column
# with the group index, plus columns with the longest / most frequent URL
# in the group and the row URL's Levenshtein distance to each.
with open(data_path / "metrics.csv", "r") as input_file, open(
    data_path / "metrics_grps.csv", "w", newline=""
) as output_file:
    csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
    csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
    header = next(csv_reader)
    header.extend(
        [
            "group_index",
            "longest",
            "longest-distance",
            "most_frequent",
            "most_frequent-distance",
        ]
    )
    csv_writer.writerow(header)
    for row in csv_reader:
        # NOTE: columns 3 and 6 are assumed to contain the relevant URL
        # fields (the indices come from the original file layout).
        for idx, grp in enumerate(url_groups):
            if row[3] in grp:
                row.append(idx)
                longest_in_grp = max(grp, key=len)
                row.append(longest_in_grp)
                row.append(Levenshtein.distance(row[6], longest_in_grp))
                most_frequent_in_grp = max(set(grp), key=grp.count)
                row.append(most_frequent_in_grp)
                row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
                break  # every URL belongs to exactly one group
        csv_writer.writerow(row)
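# Example invocation (the script filename here is illustrative):
#
#   python distance_eval.py /path/to/vp_dir
#
# This prints the number of URL groups found and writes
# /path/to/vp_dir/metrics_grps.csv with the five extra columns appended.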