Diffstat (limited to 'bjoern/videoanalyse')
-rw-r--r-- | bjoern/videoanalyse/combine_ocr-logs.py |  52
-rw-r--r-- | bjoern/videoanalyse/eval.py             |  29
-rw-r--r-- | bjoern/videoanalyse/post_processing.py  | 108
3 files changed, 189 insertions, 0 deletions
diff --git a/bjoern/videoanalyse/combine_ocr-logs.py b/bjoern/videoanalyse/combine_ocr-logs.py
new file mode 100644
index 0000000..76c59ed
--- /dev/null
+++ b/bjoern/videoanalyse/combine_ocr-logs.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+from datetime import datetime, timedelta
+import pandas as pd
+import csv
+
+argparser = argparse.ArgumentParser(
+    description="Combines results of OCR analysis with log files"
+)
+argparser.add_argument(
+    "vp_dir", help="Directory containing analysis_results.csv and VPCODE.csv"
+)
+
+args = argparser.parse_args()
+
+vp_path = Path(args.vp_dir)
+
+video_path = next(vp_path.glob("*.mkv"))
+date_format = "%Y-%m-%d %H-%M-%S"
+video_date = datetime.strptime(video_path.stem, date_format)
+print(video_date)
+# video_delta = timedelta(hours=video_date.hour, minutes=video_date.minute, seconds=video_date.second)
+
+def add_video_time_to_start(x, video_date):
+    start = timedelta(seconds=int(x))
+    return (start + video_date).time().isoformat()
+
+analysis = pd.read_csv(vp_path / "analysis_results.csv")
+analysis["Starttime"] = analysis["start_time"].apply(add_video_time_to_start, args=(video_date,))
+
+logs = pd.read_csv(vp_path / f"{vp_path.name}.csv")
+
+def get_log_url(start_time):
+    start_time = datetime.strptime(start_time, "%H:%M:%S")
+
+    for _, row in logs.iterrows():
+        log_start = datetime.strptime(row[0], "%H:%M:%S")
+        log_end = datetime.strptime(row[1], "%H:%M:%S")
+        if start_time >= log_start and start_time <= log_end:
+            return row[3]
+    return 0
+
+
+
+analysis["log_url"] = analysis.apply(
+    lambda row: get_log_url(row.Starttime), axis=1
+    )
+
+
+analysis.to_csv(vp_path / "merged.csv", quoting=csv.QUOTE_NONNUMERIC)
diff --git a/bjoern/videoanalyse/eval.py b/bjoern/videoanalyse/eval.py
new file mode 100644
index 0000000..a917b7a
--- /dev/null
+++ b/bjoern/videoanalyse/eval.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+import pandas as pd
+import csv
+from Levenshtein import distance as levendist
+
+
+argparser = argparse.ArgumentParser(
+    description="Distance evaluation"
+)
+argparser.add_argument(
+    "vp_dir", help="Directory containing merged.csv"
+)
+
+args = argparser.parse_args()
+
+vp_path = Path(args.vp_dir)
+
+df = pd.read_csv(vp_path / "merged.csv")
+
+df["levenshtein-distance"] = df.apply(
+    lambda row: levendist(str(row.url), str(row.log_url)), axis=1
+)
+
+
+
+df.to_csv(vp_path / "metrics.csv", quoting=csv.QUOTE_NONNUMERIC)
diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
new file mode 100644
index 0000000..d32457e
--- /dev/null
+++ b/bjoern/videoanalyse/post_processing.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+import pandas as pd
+import Levenshtein
+import csv
+from itertools import pairwise
+
+argparser = argparse.ArgumentParser(
+    description="Distance evaluation"
+)
+argparser.add_argument(
+    "vp_dir", help="Directory containing metrics.csv"
+)
+
+args = argparser.parse_args()
+
+data_path = Path(args.vp_dir)
+
+
+# def insertion_cost(char):
+#     return 1.0
+
+
+# def deletion_cost(char):
+#     return 1.0
+
+
+# def substitution_cost(char_a, char_b):
+#     if char_a == "t" and char_b == "r":
+#         return 0.5
+#     return 1.0
+
+
+# weighted_levenshtein = WeightedLevenshtein(
+#     substitution_cost_fn=substitution_cost,
+#     insertion_cost_fn=insertion_cost,
+#     deletion_cost_fn=deletion_cost,
+# )
+
+# Distance threshold to define "same" url
+dist_threshold = 5
+
+
+# Function to return all elements in candidates that are similar to original
+def take_similar(original, candidates):
+    print(original)
+    print(candidates)
+    result = [
+        x
+        for x in candidates
+        if dist_threshold >= Levenshtein.distance(original, x)
+    ]
+    return result
+
+
+# Read results.csv
+# with open(data_path / "metrics.csv", "r") as csvfile:
+#     reader = csv.reader(csvfile, quotechar='"')
+#     print(next(reader))
+#
+df = pd.read_csv(data_path / "metrics.csv")
+df = df.fillna('')
+
+
+# List with only urls
+all_urls = list(df["url"].values)
+urls = list(df["url"].values)
+
+# urls = [[0, "Start"]]
+# for url in all_urls:
+#     if len(url[1]) > 0:
+#         urls.append([float(url[0]), url[1]])
+
+
+# Iterate over list of all urls, putting similar one into a group and removing them from
+# the original list
+url_groups = []
+while len(all_urls) > 0:
+    group = take_similar(all_urls[0], all_urls)
+    url_groups.append([set(group), 0])
+    for url in group:
+        all_urls.remove(url)
+
+# Iterate over result-elements pairwise, removing elements under distance threshold
+# and always cumulating time of url-groups
+new_urls = []
+cum_times = []
+for pair in pairwise(urls):
+    print(pair)
+    dist = Levenshtein.distance(pair[0], pair[1])
+    if dist > dist_threshold:
+        new_urls.append(pair[1])
+
+
+with open(data_path / "grouping_post.cvs", "w") as csvfile:
+    writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    writer.writerow(["url"])
+    for line in new_urls:
+        writer.writerow(line)
+
+with open(data_path / "all_urls.txt", "w") as f:
+    for group in url_groups:
+        f.write("=== new group, cumulative_time: {}\n".format(group[1]))
+        for url in group[0]:
+            f.write(url)
+            f.write("\n")
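
Note on the added scripts: they form a small pipeline over one participant directory. combine_ocr-logs.py aligns OCR'd URLs with the browser log and writes merged.csv, eval.py adds a Levenshtein distance column and writes metrics.csv, and post_processing.py groups near-duplicate OCR'd URLs whose Levenshtein distance is at most dist_threshold. A minimal standalone sketch of that grouping idea follows; it mirrors the take_similar() loop above, but the sample URLs and the printed result are illustrative only, not taken from any real data.

    #!/usr/bin/env python3
    # Sketch: group near-duplicate OCR'd URLs by Levenshtein distance,
    # following the take_similar() loop in post_processing.py.
    import Levenshtein

    dist_threshold = 5  # same threshold as in post_processing.py

    # Hypothetical OCR output: the second entry is a misread of the first.
    urls = ["www.example.com/page", "www.exarnple.com/page", "docs.python.org"]

    groups = []
    remaining = list(urls)
    while remaining:
        head = remaining[0]
        # Everything within the threshold of the current head forms one group.
        group = [u for u in remaining if Levenshtein.distance(head, u) <= dist_threshold]
        groups.append(group)
        for u in group:
            remaining.remove(u)

    print(groups)
    # Expected: both example.com variants land in one group, docs.python.org in its own.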