path: root/bjoern/videoanalyse/post_processing.py
author Niclas Dobbertin <niclas.dobbertin@stud.tu-darmstadt.de> 2023-08-02 14:13:45 +0200
committer Niclas Dobbertin <niclas.dobbertin@stud.tu-darmstadt.de> 2023-08-02 14:13:45 +0200
commit 4d9c0dce1f5bc0bd3cde1b89875387f1a13c18c4 (patch)
tree 661fe5eb571ae7dc89ea52c546f6aa5df0cc3595 /bjoern/videoanalyse/post_processing.py
parent 3fdab9495310e8d29cab65d1ee0c1bfa0b26a76e (diff)
add additional scripts
Diffstat (limited to 'bjoern/videoanalyse/post_processing.py')
-rw-r--r-- bjoern/videoanalyse/post_processing.py | 108
1 file changed, 108 insertions, 0 deletions
diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
new file mode 100644
index 0000000..d32457e
--- /dev/null
+++ b/bjoern/videoanalyse/post_processing.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+import pandas as pd
+import Levenshtein
+import csv
+from itertools import pairwise
+
+argparser = argparse.ArgumentParser(
+ description="Distance evaluation"
+)
+argparser.add_argument(
+ "vp_dir", help="Directory containing metrics.csv"
+)
+
+args = argparser.parse_args()
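+# Example invocation (hypothetical directory name): python post_processing.py VP01/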
+
+data_path = Path(args.vp_dir)
+
+
+# def insertion_cost(char):
+# return 1.0
+
+
+# def deletion_cost(char):
+# return 1.0
+
+
+# def substitution_cost(char_a, char_b):
+# if char_a == "t" and char_b == "r":
+# return 0.5
+# return 1.0
+
+
+# weighted_levenshtein = WeightedLevenshtein(
+# substitution_cost_fn=substitution_cost,
+# insertion_cost_fn=insertion_cost,
+# deletion_cost_fn=deletion_cost,
+# )
+
+# Distance threshold to define "same" url
+dist_threshold = 5
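+# e.g. Levenshtein.distance("example.com/a", "example.com/b") == 1, so the two
+# would count as the same url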
+
+
+# Return all elements of candidates within dist_threshold of original
+def take_similar(original, candidates):
+    print(original)
+    print(candidates)
+    result = [
+        x
+        for x in candidates
+        if Levenshtein.distance(original, x) <= dist_threshold
+    ]
+    return result
+
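+# Example (hypothetical urls), with dist_threshold = 5:
+#   take_similar("google.com/mail", ["google.com/mail", "googel.com/mail", "github.com"])
+# returns ["google.com/mail", "googel.com/mail"]; "github.com" is more than 5 edits away.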
+
+# Read metrics.csv
+# with open(data_path / "metrics.csv", "r") as csvfile:
+# reader = csv.reader(csvfile, quotechar='"')
+# print(next(reader))
+#
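+# metrics.csv is assumed to contain at least a "url" column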
+df = pd.read_csv(data_path / "metrics.csv")
+df = df.fillna('')
+
+
+# Two copies of the url column: all_urls is consumed by the grouping loop below,
+# urls stays intact for the pairwise pass further down
+all_urls = list(df["url"].values)
+urls = list(df["url"].values)
+
+# urls = [[0, "Start"]]
+# for url in all_urls:
+# if len(url[1]) > 0:
+# urls.append([float(url[0]), url[1]])
+
+
+# Iterate over the list of all urls, putting similar ones into a group and removing
+# them from the original list
+url_groups = []
+while len(all_urls) > 0:
+ group = take_similar(all_urls[0], all_urls)
+ url_groups.append([set(group), 0])
+ for url in group:
+ all_urls.remove(url)
+
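+# With the hypothetical urls above, url_groups would now be
+#   [[{"google.com/mail", "googel.com/mail"}, 0], [{"github.com"}, 0]]
+# where 0 is a placeholder for the group's cumulative time.
+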
+# Iterate over the urls pairwise, keeping only urls whose distance to their
+# predecessor exceeds the threshold, i.e. collapsing runs of similar urls
+new_urls = []
+cum_times = []  # placeholder for per-group cumulative times (currently unused)
+for pair in pairwise(urls):
+ print(pair)
+ dist = Levenshtein.distance(pair[0], pair[1])
+ if dist > dist_threshold:
+ new_urls.append(pair[1])
+
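+# With the hypothetical urls above, only "github.com" survives this filter; the
+# first url of the sequence is never appended, so new_urls == ["github.com"].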
+
+with open(data_path / "grouping_post.csv", "w") as csvfile:
+ writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+ writer.writerow(["url"])
+    for url in new_urls:
+        writer.writerow([url])
+
+with open(data_path / "all_urls.txt", "w") as f:
+ for group in url_groups:
+ f.write("=== new group, cumulative_time: {}\n".format(group[1]))
+ for url in group[0]:
+ f.write(url)
+ f.write("\n")