diff options
-rw-r--r-- | bjoern/videoanalyse/combine_ocr-logs.py | 52 | ||||
-rw-r--r-- | bjoern/videoanalyse/eval.py | 29 | ||||
-rw-r--r-- | bjoern/videoanalyse/post_processing.py | 108 |
3 files changed, 189 insertions, 0 deletions
# --- bjoern/videoanalyse/combine_ocr-logs.py (new file) ---
#!/usr/bin/env python3
"""Combine the results of the OCR analysis with the proband's browser log.

Reads ``analysis_results.csv`` (OCR segments with offsets in seconds from
the start of the screen recording) and ``<VPCODE>.csv`` (the log file named
after the directory), matches each OCR segment's wall-clock start time
against the log's [start, end] intervals, and writes ``merged.csv``.
"""

import argparse
import csv
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd

# Timestamp format encoded in the recording's file name,
# e.g. "2021-11-30 12-00-00.mkv".
VIDEO_DATE_FORMAT = "%Y-%m-%d %H-%M-%S"


def add_video_time_to_start(seconds, video_date):
    """Return the wall-clock time ("HH:MM:SS") of an OCR segment.

    ``seconds`` is the segment's offset from the start of the recording
    (anything ``int()`` accepts); ``video_date`` is the recording's start
    timestamp as a ``datetime``.
    """
    return (video_date + timedelta(seconds=int(seconds))).time().isoformat()


def get_log_url(start_time, logs):
    """Return the URL (4th column) of the first log row whose
    [start, end] interval (1st/2nd column, "HH:MM:SS") contains
    ``start_time``, or 0 if no interval matches.
    """
    start = datetime.strptime(start_time, "%H:%M:%S")
    for _, row in logs.iterrows():
        # .iloc avoids the deprecated positional Series __getitem__.
        log_start = datetime.strptime(row.iloc[0], "%H:%M:%S")
        log_end = datetime.strptime(row.iloc[1], "%H:%M:%S")
        if log_start <= start <= log_end:
            return row.iloc[3]
    return 0


def main():
    argparser = argparse.ArgumentParser(
        description="Combines results of OCR analysis with log files"
    )
    argparser.add_argument(
        "vp_dir", help="Directory containing analysis_results.csv and VPCODE.csv"
    )
    args = argparser.parse_args()
    vp_path = Path(args.vp_dir)

    # The recording's start timestamp is encoded in the video file name.
    video_path = next(vp_path.glob("*.mkv"))
    video_date = datetime.strptime(video_path.stem, VIDEO_DATE_FORMAT)
    print(video_date)

    analysis = pd.read_csv(vp_path / "analysis_results.csv")
    analysis["Starttime"] = analysis["start_time"].apply(
        add_video_time_to_start, args=(video_date,)
    )

    # The log file is named after the directory (the proband code).
    logs = pd.read_csv(vp_path / f"{vp_path.name}.csv")
    analysis["log_url"] = analysis["Starttime"].apply(get_log_url, args=(logs,))

    analysis.to_csv(vp_path / "merged.csv", quoting=csv.QUOTE_NONNUMERIC)


if __name__ == "__main__":
    main()


# --- bjoern/videoanalyse/eval.py (new file, continues in next chunk) ---
#!/usr/bin/env python3

import argparse
from pathlib import Path
import pandas as pd
# --- bjoern/videoanalyse/eval.py (continued from previous chunk) ---
import csv
from Levenshtein import distance as levendist


def main():
    """Read merged.csv, add a per-row OCR-url vs. log-url Levenshtein
    distance column, and write the result to metrics.csv."""
    argparser = argparse.ArgumentParser(description="Distance evaluation")
    argparser.add_argument("vp_dir", help="Directory containing merged.csv")
    args = argparser.parse_args()

    vp_path = Path(args.vp_dir)
    df = pd.read_csv(vp_path / "merged.csv")

    # str() guards against NaN cells (floats) produced by read_csv for
    # empty url fields.
    df["levenshtein-distance"] = df.apply(
        lambda row: levendist(str(row.url), str(row.log_url)), axis=1
    )

    df.to_csv(vp_path / "metrics.csv", quoting=csv.QUOTE_NONNUMERIC)


if __name__ == "__main__":
    main()


# --- bjoern/videoanalyse/post_processing.py (new file, continues in next chunk) ---
#!/usr/bin/env python3
"""Group near-duplicate URLs from metrics.csv.

OCR output contains slightly different readings of the same URL; rows whose
Levenshtein distance is within a threshold are treated as the same URL.
"""

import argparse
import csv
from itertools import pairwise
from pathlib import Path

import Levenshtein
import pandas as pd

argparser = argparse.ArgumentParser(description="Distance evaluation")
argparser.add_argument("vp_dir", help="Directory containing metrics.csv")

args = argparser.parse_args()

data_path = Path(args.vp_dir)

# Distance threshold to define "same" url.
dist_threshold = 5


def take_similar(original, candidates):
    """Return all elements of ``candidates`` within ``dist_threshold``
    Levenshtein distance of ``original`` (always includes ``original``,
    since its distance to itself is 0)."""
    return [
        cand
        for cand in candidates
        if Levenshtein.distance(original, cand) <= dist_threshold
    ]
df = pd.read_csv(data_path / "metrics.csv")
# Empty url cells come back from read_csv as NaN floats; normalise them to
# "" so the string-distance calls below always receive str.
df = df.fillna('')

# Two working copies: all_urls is consumed by the grouping loop below,
# urls keeps the original chronological order for the pairwise pass.
all_urls = list(df["url"].values)
urls = list(df["url"].values)

# Partition the urls into similarity groups: repeatedly take the first
# remaining url, collect everything similar to it, and remove that group.
# The second element of each group entry is a cumulative-time placeholder
# (not yet computed — always 0 for now).
url_groups = []
while all_urls:
    group = take_similar(all_urls[0], all_urls)
    url_groups.append([set(group), 0])
    for url in group:
        all_urls.remove(url)

# Collapse consecutive near-duplicate readings: keep only urls whose
# distance to their predecessor exceeds the threshold.
# NOTE(review): pairwise() means the very first url (urls[0]) is never
# emitted — confirm whether that is intended.
new_urls = []
for prev_url, cur_url in pairwise(urls):
    if Levenshtein.distance(prev_url, cur_url) > dist_threshold:
        new_urls.append(cur_url)

# Fixed: output extension typo (".cvs" -> ".csv"), and writerow() being
# handed a bare string, which csv.writer iterates character-by-character,
# producing one column per character. newline="" is required by the csv
# module to avoid blank lines on Windows.
with open(data_path / "grouping_post.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(["url"])
    for url in new_urls:
        writer.writerow([url])

with open(data_path / "all_urls.txt", "w") as f:
    for group, cum_time in url_groups:
        f.write("=== new group, cumulative_time: {}\n".format(cum_time))
        for url in group:
            f.write(url)
            f.write("\n")