Diffstat (limited to 'bjoern')
-rw-r--r--  bjoern/videoanalyse/combine_ocr-logs.py  |  52
-rw-r--r--  bjoern/videoanalyse/eval.py              |  29
-rw-r--r--  bjoern/videoanalyse/post_processing.py   | 108
3 files changed, 189 insertions, 0 deletions
diff --git a/bjoern/videoanalyse/combine_ocr-logs.py b/bjoern/videoanalyse/combine_ocr-logs.py
new file mode 100644
index 0000000..76c59ed
--- /dev/null
+++ b/bjoern/videoanalyse/combine_ocr-logs.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+from datetime import datetime, timedelta
+import pandas as pd
+import csv
+
+argparser = argparse.ArgumentParser(
+ description="Combines results of OCR analysis with log files"
+)
+argparser.add_argument(
+ "vp_dir", help="Directory containing analysis_results.csv and VPCODE.csv"
+)
+
+args = argparser.parse_args()
+
+vp_path = Path(args.vp_dir)
+
+video_path = next(vp_path.glob("*.mkv"))
+date_format = "%Y-%m-%d %H-%M-%S"
+video_date = datetime.strptime(video_path.stem, date_format)
+print(video_date)
+# video_delta = timedelta(hours=video_date.hour, minutes=video_date.minute, seconds=video_date.second)
+
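+# Convert a start offset in seconds (from the OCR results) into an absolute time of day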
+def add_video_time_to_start(x, video_date):
+ start = timedelta(seconds=int(x))
+ return (start + video_date).time().isoformat()
+
+analysis = pd.read_csv(vp_path / "analysis_results.csv")
+analysis["Starttime"] = analysis["start_time"].apply(add_video_time_to_start, args=(video_date,))
+
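+# The participant's log file is named after the directory (VPCODE.csv)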
+logs = pd.read_csv(vp_path / f"{vp_path.name}.csv")
+
+# Find the log entry whose [start, end] interval contains the given OCR start time
+def get_log_url(start_time):
+    start_time = datetime.strptime(start_time, "%H:%M:%S")
+
+    # columns by position: 0 = start time, 1 = end time, 3 = visited url
+    for _, row in logs.iterrows():
+        log_start = datetime.strptime(row.iloc[0], "%H:%M:%S")
+        log_end = datetime.strptime(row.iloc[1], "%H:%M:%S")
+        if log_start <= start_time <= log_end:
+            return row.iloc[3]
+    # no matching log entry
+    return 0
+
+
+
+analysis["log_url"] = analysis.apply(
+ lambda row: get_log_url(row.Starttime), axis=1
+ )
+
+
+analysis.to_csv(vp_path / "merged.csv", quoting=csv.QUOTE_NONNUMERIC)
diff --git a/bjoern/videoanalyse/eval.py b/bjoern/videoanalyse/eval.py
new file mode 100644
index 0000000..a917b7a
--- /dev/null
+++ b/bjoern/videoanalyse/eval.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+import pandas as pd
+import csv
+from Levenshtein import distance as levendist
+
+
+argparser = argparse.ArgumentParser(
+ description="Distance evaluation"
+)
+argparser.add_argument(
+ "vp_dir", help="Directory containing merged.csv"
+)
+
+args = argparser.parse_args()
+
+vp_path = Path(args.vp_dir)
+
+df = pd.read_csv(vp_path / "merged.csv")
+
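+# Character-level edit distance between the OCR-extracted url and the url from the log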
+df["levenshtein-distance"] = df.apply(
+ lambda row: levendist(str(row.url), str(row.log_url)), axis=1
+)
+
+
+
+df.to_csv(vp_path / "metrics.csv", quoting=csv.QUOTE_NONNUMERIC)
diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
new file mode 100644
index 0000000..d32457e
--- /dev/null
+++ b/bjoern/videoanalyse/post_processing.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+import pandas as pd
+import Levenshtein
+import csv
+from itertools import pairwise
+
+argparser = argparse.ArgumentParser(
+    description="Groups similar urls from the OCR/log comparison"
+)
+argparser.add_argument(
+ "vp_dir", help="Directory containing metrics.csv"
+)
+
+args = argparser.parse_args()
+
+data_path = Path(args.vp_dir)
+
+
+# def insertion_cost(char):
+# return 1.0
+
+
+# def deletion_cost(char):
+# return 1.0
+
+
+# def substitution_cost(char_a, char_b):
+# if char_a == "t" and char_b == "r":
+# return 0.5
+# return 1.0
+
+
+# weighted_levenshtein = WeightedLevenshtein(
+# substitution_cost_fn=substitution_cost,
+# insertion_cost_fn=insertion_cost,
+# deletion_cost_fn=deletion_cost,
+# )
+
+# Distance threshold to define "same" url
+dist_threshold = 5
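+# (up to 5 edit operations are tolerated, e.g. single characters misread by the OCR)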
+
+
+# Function to return all elements in candidates that are similar to original
+def take_similar(original, candidates):
+ print(original)
+ print(candidates)
+ result = [
+ x
+ for x in candidates
+ if dist_threshold >= Levenshtein.distance(original, x)
+ ]
+ return result
+
+
+# Read results.csv
+# with open(data_path / "metrics.csv", "r") as csvfile:
+# reader = csv.reader(csvfile, quotechar='"')
+# print(next(reader))
+#
+df = pd.read_csv(data_path / "metrics.csv")
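+# rows without a recognised url are NaN; replace with "" so the string comparisons below don't fail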
+df = df.fillna('')
+
+
+# List with only urls
+all_urls = list(df["url"].values)
+urls = list(df["url"].values)
+
+# urls = [[0, "Start"]]
+# for url in all_urls:
+# if len(url[1]) > 0:
+# urls.append([float(url[0]), url[1]])
+
+
+# Iterate over the list of all urls, putting similar ones into a group and removing
+# them from the original list
+url_groups = []
+while len(all_urls) > 0:
+ group = take_similar(all_urls[0], all_urls)
+ url_groups.append([set(group), 0])
+ for url in group:
+ all_urls.remove(url)
+
+# Iterate over consecutive urls pairwise, keeping only those that differ from their
+# predecessor by more than the distance threshold (collapses runs of near-identical
+# urls; note that the very first url is never emitted)
+new_urls = []
+cum_times = []  # placeholder, cumulative time per group is not computed yet
+for pair in pairwise(urls):
+    print(pair)
+    dist = Levenshtein.distance(pair[0], pair[1])
+    if dist > dist_threshold:
+        new_urls.append(pair[1])
+
+
+with open(data_path / "grouping_post.csv", "w") as csvfile:
+    writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    writer.writerow(["url"])
+    for line in new_urls:
+        # wrap in a list so each url is written as one column, not one character per column
+        writer.writerow([line])
+
+with open(data_path / "all_urls.txt", "w") as f:
+ for group in url_groups:
+ f.write("=== new group, cumulative_time: {}\n".format(group[1]))
+ for url in group[0]:
+ f.write(url)
+ f.write("\n")