From 7266e2787bfa661d490be9c4463e707e6ffe1715 Mon Sep 17 00:00:00 2001
From: Niclas Dobbertin
Date: Thu, 14 Sep 2023 21:38:25 +0200
Subject: uncomment grouped metrics, with new groups

---
 bjoern/videoanalyse/post_processing.py | 118 ++++++++++++++-------------------
 1 file changed, 48 insertions(+), 70 deletions(-)

(limited to 'bjoern/videoanalyse/post_processing.py')

diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index cc0599f..94ce067 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -6,7 +6,8 @@ import numpy as np
 import pandas as pd
 import Levenshtein
 import csv
-from itertools import pairwise
+from itertools import groupby
+from operator import itemgetter
 from sklearn.metrics import pairwise_distances
 from sklearn.cluster import DBSCAN
 from pprint import pprint
@@ -18,41 +19,6 @@ args = argparser.parse_args()
 
 data_path = Path(args.vp_dir)
 
-
-# def insertion_cost(char):
-#     return 1.0
-
-
-# def deletion_cost(char):
-#     return 1.0
-
-
-# def substitution_cost(char_a, char_b):
-#     if char_a == "t" and char_b == "r":
-#         return 0.5
-#     return 1.0
-
-
-# weighted_levenshtein = WeightedLevenshtein(
-#     substitution_cost_fn=substitution_cost,
-#     insertion_cost_fn=insertion_cost,
-#     deletion_cost_fn=deletion_cost,
-# )
-
-# Distance threshold to define "same" url
-dist_threshold = 5
-
-
-# Function to return all elements in candidates that are similar to original
-def take_similar(original, candidates):
-    print(original)
-    print(candidates)
-    result = [
-        x for x in candidates if dist_threshold >= Levenshtein.distance(original, x)
-    ]
-    return result
-
-
 # Read results.csv
 # with open(data_path / "metrics.csv", "r") as csvfile:
 #     reader = csv.reader(csvfile, quotechar='"')
@@ -84,45 +50,57 @@ def group_urls(urls):
     # TODO: eps and min_samples parameter
     db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)
     labels = db.labels_
-    pprint(list(zip(urls, labels)))
-    return labels
+    zipped = zip(urls, labels)
+
+    # grouping solution from: https://www.geeksforgeeks.org/python-group-tuple-into-list-based-on-value/
+    # create an empty dictionary to store the grouped tuples
+    grouped_dict = {}
 
 
-labels = group_urls(urls)
-print(list(labels))
+    # loop through the tuples in the list
+    for tup in zipped:
+        # get the second element of the tuple
+        key = tup[1]
+        # if the key is not already in the dictionary, add it with an empty list as value
+        if key not in grouped_dict:
+            grouped_dict[key] = []
+        # append the current tuple to the list corresponding to the key in the dictionary
+        grouped_dict[key].append(tup[0])
+    # convert the dictionary values to lists and store in res
+    url_groups = [v for _, v in grouped_dict.items()]
 
-# urls = [[0, "Start"]]
-# for url in all_urls:
-#     if len(url[1]) > 0:
-#         urls.append([float(url[0]), url[1]])
+    return url_groups
 
 
-# Iterate over list of all urls, putting similar one into a group and removing them from
-# the original list
-# url_groups = []
-# while len(all_urls) > 0:
-#     group = take_similar(all_urls[0], all_urls)
-#     url_groups.append(group)
-#     for url in group:
-#         all_urls.remove(url)
+url_groups = group_urls(urls)
+pprint(len(url_groups))
 
 # # for every row check which group its url belongs to and add a column with group indices
 # # also add columns with longest/most frequent url in group
-# with open (data_path / "metrics.csv", "r") as input_file, \
-#     open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
-#     csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-#     csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-#     header = next(csv_reader)
-#     header.extend(["group_index","longest","longest-distance","most_frequent","most_frequent-distance"])
-#     csv_writer.writerow(header)
-#     for row in csv_reader:
-#         for idx, grp in enumerate(url_groups):
-#             if row[3] in grp:
-#                 row.append(idx)
-#                 longest_in_grp = max(grp, key=len)
-#                 row.append(longest_in_grp)
-#                 row.append(Levenshtein.distance(row[6], longest_in_grp))
-#                 most_frequent_in_grp = max(set(grp), key=grp.count)
-#                 row.append(most_frequent_in_grp)
-#                 row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
-#                 csv_writer.writerow(row)
+with open(data_path / "metrics.csv", "r") as input_file, open(
+    data_path / "metrics_grps.csv", "w", newline=""
+) as output_file:
+    csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    header = next(csv_reader)
+    header.extend(
+        [
+            "group_index",
+            "longest",
+            "longest-distance",
+            "most_frequent",
+            "most_frequent-distance",
+        ]
+    )
+    csv_writer.writerow(header)
+    for row in csv_reader:
+        for idx, grp in enumerate(url_groups):
+            if row[3] in grp:
+                row.append(idx)
+                longest_in_grp = max(grp, key=len)
+                row.append(longest_in_grp)
+                row.append(Levenshtein.distance(row[6], longest_in_grp))
+                most_frequent_in_grp = max(set(grp), key=grp.count)
+                row.append(most_frequent_in_grp)
+                row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
+                csv_writer.writerow(row)
--
cgit v1.2.3