From 4d9c0dce1f5bc0bd3cde1b89875387f1a13c18c4 Mon Sep 17 00:00:00 2001
From: Niclas Dobbertin <niclas.dobbertin@stud.tu-darmstadt.de>
Date: Wed, 2 Aug 2023 14:13:45 +0200
Subject: add additional scripts

---
 bjoern/videoanalyse/post_processing.py | 108 +++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 bjoern/videoanalyse/post_processing.py

(limited to 'bjoern/videoanalyse/post_processing.py')

diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
new file mode 100644
index 0000000..d32457e
--- /dev/null
+++ b/bjoern/videoanalyse/post_processing.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+import pandas as pd
+import Levenshtein
+import csv
+from itertools import pairwise
+
+# Command line interface: a single positional argument names the directory
+# that this script works in
+argparser = argparse.ArgumentParser(description="Distance evaluation")
+argparser.add_argument("vp_dir", help="Directory containing metrics.csv")
+
+args = argparser.parse_args()
+
+# Base directory: metrics.csv is read from it and all output files are
+# written into it
+data_path = Path(args.vp_dir)
+
+
+# def insertion_cost(char):
+#     return 1.0
+
+
+# def deletion_cost(char):
+#     return 1.0
+
+
+# def substitution_cost(char_a, char_b):
+#     if char_a == "t" and char_b == "r":
+#         return 0.5
+#     return 1.0
+
+
+# weighted_levenshtein = WeightedLevenshtein(
+#     substitution_cost_fn=substitution_cost,
+#     insertion_cost_fn=insertion_cost,
+#     deletion_cost_fn=deletion_cost,
+# )
+
+# Maximum Levenshtein distance up to which two urls are treated as the "same" url
+dist_threshold = 5
+
+
+# Collect the candidates lying within dist_threshold edits of original
+def take_similar(original, candidates):
+    """Return, in input order, every candidate similar enough to original."""
+    print(original)
+    print(candidates)
+    matches = []
+    for candidate in candidates:
+        if Levenshtein.distance(original, candidate) <= dist_threshold:
+            matches.append(candidate)
+    return matches
+
+
+# Load metrics.csv from the data directory; cells without a value are replaced
+# by empty strings
+metrics_path = data_path / "metrics.csv"
+df = pd.read_csv(metrics_path).fillna("")
+
+
+# Two independent copies of the url column:
+#   all_urls - emptied step by step while the similarity groups are built
+#   urls     - kept in file order for the pairwise comparison
+url_column = df["url"].values
+all_urls = list(url_column)
+urls = list(url_column)
+
+# urls = [[0, "Start"]]
+# for url in all_urls:
+#     if len(url[1]) > 0:
+#         urls.append([float(url[0]), url[1]])
+
+
+# Repeatedly take the first remaining url, gather everything similar to it into
+# one group, and drop the whole group from all_urls until no url is left
+url_groups = []
+while all_urls:
+    group = take_similar(all_urls[0], all_urls)
+    members = set(group)
+    url_groups.append([members, 0])
+    all_urls = [url for url in all_urls if url not in members]
+
+# Collapse runs of consecutive similar urls. The first url is always kept (it
+# used to be dropped, since only the second element of each pair was stored);
+# a later url is kept when it is more than dist_threshold edits from its predecessor
+new_urls = urls[:1]
+cum_times = []  # NOTE(review): never filled; no time is cumulated here yet
+for previous, current in pairwise(urls):
+    print((previous, current))
+    if Levenshtein.distance(previous, current) > dist_threshold:
+        new_urls.append(current)
+
+
+# newline="" per csv docs. NOTE(review): ".cvs" is presumably a typo of ".csv"
+with open(data_path / "grouping_post.cvs", "w", newline="") as csvfile:
+    writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    writer.writerow(["url"])
+    writer.writerows([url] for url in new_urls)  # a bare str would be split per char
+
+# Dump every url group: a header line carrying the group's cumulative-time
+# slot, followed by the group's urls, one per line
+with open(data_path / "all_urls.txt", "w") as f:
+    for members, cumulative_time in url_groups:
+        f.write("=== new group, cumulative_time: {}\n".format(cumulative_time))
+        f.writelines(url + "\n" for url in members)
-- 
cgit v1.2.3