Diffstat (limited to 'bjoern/videoanalyse')
-rw-r--r--  bjoern/videoanalyse/post_processing.py  118
1 file changed, 48 insertions(+), 70 deletions(-)
diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index cc0599f..94ce067 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -6,7 +6,8 @@ import numpy as np
import pandas as pd
import Levenshtein
import csv
-from itertools import pairwise
+from itertools import groupby
+from operator import itemgetter
from sklearn.metrics import pairwise_distances
from sklearn.cluster import DBSCAN
from pprint import pprint
@@ -18,41 +19,6 @@ args = argparser.parse_args()
data_path = Path(args.vp_dir)
-
-# def insertion_cost(char):
-# return 1.0
-
-
-# def deletion_cost(char):
-# return 1.0
-
-
-# def substitution_cost(char_a, char_b):
-# if char_a == "t" and char_b == "r":
-# return 0.5
-# return 1.0
-
-
-# weighted_levenshtein = WeightedLevenshtein(
-# substitution_cost_fn=substitution_cost,
-# insertion_cost_fn=insertion_cost,
-# deletion_cost_fn=deletion_cost,
-# )
-
-# Distance threshold to define "same" url
-dist_threshold = 5
-
-
-# Function to return all elements in candidates that are similar to original
-def take_similar(original, candidates):
-    print(original)
-    print(candidates)
-    result = [
-        x for x in candidates if dist_threshold >= Levenshtein.distance(original, x)
-    ]
-    return result
-
-
# Read results.csv
# with open(data_path / "metrics.csv", "r") as csvfile:
# reader = csv.reader(csvfile, quotechar='"')
@@ -84,45 +50,57 @@ def group_urls(urls):
    # TODO: eps and min_samples parameter
    db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)
    labels = db.labels_
-    pprint(list(zip(urls, labels)))
-    return labels
+    zipped = zip(urls, labels)
+
+    # grouping solution from: https://www.geeksforgeeks.org/python-group-tuple-into-list-based-on-value/
+    # create an empty dictionary to store the grouped tuples
+    grouped_dict = {}
-labels = group_urls(urls)
-print(list(labels))
+    # loop through the (url, label) tuples
+    for tup in zipped:
+        # the second element of the tuple is the cluster label
+        key = tup[1]
+        # if the key is not already in the dictionary, add it with an empty list as value
+        if key not in grouped_dict:
+            grouped_dict[key] = []
+        # append the current tuple's url to the list corresponding to the key in the dictionary
+        grouped_dict[key].append(tup[0])
+    # collect the dictionary values as the list of URL groups
+    url_groups = [v for _, v in grouped_dict.items()]
-# urls = [[0, "Start"]]
-# for url in all_urls:
-# if len(url[1]) > 0:
-# urls.append([float(url[0]), url[1]])
+    return url_groups
-# Iterate over list of all urls, putting similar one into a group and removing them from
-# the original list
-# url_groups = []
-# while len(all_urls) > 0:
-# group = take_similar(all_urls[0], all_urls)
-# url_groups.append(group)
-# for url in group:
-# all_urls.remove(url)
+url_groups = group_urls(urls)
+pprint(len(url_groups))
# # for every row check which group its url belongs to and add a column with group indices
# # also add columns with longest/most frequent url in group
-# with open (data_path / "metrics.csv", "r") as input_file, \
-# open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
-# csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-# csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-# header = next(csv_reader)
-# header.extend(["group_index","longest","longest-distance","most_frequent","most_frequent-distance"])
-# csv_writer.writerow(header)
-# for row in csv_reader:
-# for idx, grp in enumerate(url_groups):
-# if row[3] in grp:
-# row.append(idx)
-# longest_in_grp = max(grp, key=len)
-# row.append(longest_in_grp)
-# row.append(Levenshtein.distance(row[6], longest_in_grp))
-# most_frequent_in_grp = max(set(grp), key=grp.count)
-# row.append(most_frequent_in_grp)
-# row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
-# csv_writer.writerow(row)
+with open(data_path / "metrics.csv", "r") as input_file, open(
+    data_path / "metrics_grps.csv", "w", newline=""
+) as output_file:
+    csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    header = next(csv_reader)
+    header.extend(
+        [
+            "group_index",
+            "longest",
+            "longest-distance",
+            "most_frequent",
+            "most_frequent-distance",
+        ]
+    )
+    csv_writer.writerow(header)
+    for row in csv_reader:
+        for idx, grp in enumerate(url_groups):
+            if row[3] in grp:
+                row.append(idx)
+                longest_in_grp = max(grp, key=len)
+                row.append(longest_in_grp)
+                row.append(Levenshtein.distance(row[6], longest_in_grp))
+                most_frequent_in_grp = max(set(grp), key=grp.count)
+                row.append(most_frequent_in_grp)
+                row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
+        csv_writer.writerow(row)
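
The new group_urls above consumes a precomputed distance_matrix and a urls list whose construction lies outside the visible hunks. A minimal sketch of that missing step, assuming urls is a plain list of URL strings collected earlier in the script; the eps and min_samples values simply mirror the TODO in the hunk and are hypothetical, not tuned:

import numpy as np
import Levenshtein
from sklearn.cluster import DBSCAN


def group_urls(urls):
    # pairwise Levenshtein edit distances, passed to DBSCAN as a precomputed metric
    n = len(urls)
    distance_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            dist = Levenshtein.distance(urls[i], urls[j])
            distance_matrix[i, j] = dist
            distance_matrix[j, i] = dist

    # cluster near-identical URL strings; label -1 (DBSCAN noise) simply becomes its own group
    db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)

    # collect the URLs of each cluster label into one group
    groups = {}
    for url, label in zip(urls, db.labels_):
        groups.setdefault(label, []).append(url)
    return list(groups.values())

The groupby and itemgetter imports added in the first hunk would allow the same final grouping via sorted(zip(urls, labels), key=itemgetter(1)) followed by itertools.groupby, though the committed code keeps the explicit dictionary loop.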