From ba078d599987608f0a75f0275834af19b3e2dae3 Mon Sep 17 00:00:00 2001
From: Niclas Dobbertin
Date: Wed, 4 Oct 2023 12:12:29 +0200
Subject: eval over all vps, result df

---
 bjoern/videoanalyse/post_processing.py | 34 +++++++++++++++++++++++-----------
 bjoern/videoanalyse/utils.py           | 28 ++++++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 13 deletions(-)

(limited to 'bjoern/videoanalyse')

diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index a8d37c4..6ffff1e 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -3,27 +3,39 @@ import argparse
 from pathlib import Path
 from pprint import pprint
 
+import pandas as pd
 import utils
 
 argparser = argparse.ArgumentParser(description="OCR-Logfile evaluation")
-argparser.add_argument("vp_dir", help="VP Directory")
+argparser.add_argument("vp_dir", help="Directory with all VPs")
 args = argparser.parse_args()
 
 data_path = Path(args.vp_dir)
 
-video_path = next(data_path.glob("*.mkv"))
-ocr_path = data_path / "analysis_results.csv"
-log_path = data_path / f"{data_path.stem}.csv"
+all_vp = [x for x in data_path.iterdir() if x.is_dir()]
 
-df = utils.combine_ocr_logs(video_path, ocr_path, log_path)
-df = df.fillna('')
+vp_results = []
+for vp_path in all_vp:
+    video_path = next(vp_path.glob("*.mkv"))
+    ocr_path = vp_path / "analysis_results.csv"
+    log_path = vp_path / f"{vp_path.stem}.csv"
 
-df = utils.calc_levenshtein_distance(df)
+    df = utils.combine_ocr_logs(video_path, ocr_path, log_path)
+    df = df.fillna('')
+    df["vp_code"] = vp_path.stem
 
+    df = utils.calc_levenshtein_distance(df)
 
-url_groups = utils.group_urls(list(df["url"].values))
-pprint(len(url_groups))
+    url_groups = utils.group_urls(list(df["url"].values))
+    pprint(len(url_groups))
 
-df.to_csv(f"{data_path}/metrics.csv")
-utils.write_grouped_metrics(df, url_groups, data_path)
+    df.to_csv(f"{vp_path}/metrics.csv")
+    utils.write_grouped_metrics(df, url_groups, vp_path)
+
+    df = pd.read_csv(f"{vp_path}/metrics_grps.csv")
+
+    vp_results.append(df)
+
+evals = utils.evaluate_results(vp_results)
+pprint(evals)
diff --git a/bjoern/videoanalyse/utils.py b/bjoern/videoanalyse/utils.py
index e060a89..69ffa96 100644
--- a/bjoern/videoanalyse/utils.py
+++ b/bjoern/videoanalyse/utils.py
@@ -20,13 +20,11 @@ def combine_ocr_logs(video_path, ocr_path, log_path):
         start = timedelta(seconds=int(round(x)))
         return (start + video_date).time().isoformat()
 
-    # analysis = pd.read_csv(vp_path / "analysis_results.csv")
     analysis = pd.read_csv(ocr_path)
     analysis["Starttime"] = analysis["start_time"].apply(
         add_video_time_to_start, args=(video_date,)
     )
 
-    # logs = pd.read_csv(vp_path / f"{vp_path.name}.csv")
     logs = pd.read_csv(log_path)
 
     def get_log_url(start_time):
@@ -123,3 +121,29 @@ def write_grouped_metrics(df, url_groups, data_path):
             row.append(str(most_frequent_in_grp))
             row.append(levendist(row[5], most_frequent_in_grp))
             csv_writer.writerow(row)
+
+
+def evaluate_results(vp_results):
+    vp_code = [df["vp_code"].values[0] for df in vp_results]
+    mean_lev = [
+        sum(df["levenshtein-distance"].values) / len(df["levenshtein-distance"])
+        for df in vp_results
+    ]
+    mean_long = [
+        sum(df["longest-distance"].values) / len(df["longest-distance"])
+        for df in vp_results
+    ]
+    mean_freq = [
+        sum(df["most_frequent-distance"].values) / len(df["most_frequent-distance"])
+        for df in vp_results
+    ]
+
+    metrics = {
+        "vp_code": vp_code,
+        "mean_lev": mean_lev,
+        "mean_long": mean_long,
+        "mean_freq": mean_freq,
+    }
+    evals = pd.DataFrame(metrics)
+
+    return evals
--
cgit v1.2.3
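
Note (not part of the patch): with this change, post_processing.py takes the directory that holds one sub-directory per VP rather than a single VP directory. A minimal sanity-check sketch of the expected layout follows; the top-level directory name "all_vps" is an assumption for illustration, the three per-VP file names come from the script itself.

# Sanity-check sketch, not part of the patch: confirm each VP sub-directory
# under vp_dir holds the inputs post_processing.py now expects.
from pathlib import Path

vp_dir = Path("all_vps")  # placeholder name, passed as the vp_dir argument
for vp_path in (p for p in vp_dir.iterdir() if p.is_dir()):
    has_video = any(vp_path.glob("*.mkv"))
    has_ocr = (vp_path / "analysis_results.csv").exists()
    has_log = (vp_path / f"{vp_path.stem}.csv").exists()
    if not (has_video and has_ocr and has_log):
        print(f"{vp_path.name}: missing inputs "
              f"(video={has_video}, ocr={has_ocr}, log={has_log})")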
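
The per-VP means built by evaluate_results could equivalently be computed with a concat/groupby. The sketch below is only an alternative, not the author's implementation; it assumes the three distance columns read from metrics_grps.csv are numeric in every per-VP frame (column names taken from the patch).

# Alternative aggregation sketch: one row per vp_code with the mean of each
# distance metric, matching the shape of evaluate_results' output.
import pandas as pd

def evaluate_results_groupby(vp_results):
    combined = pd.concat(vp_results, ignore_index=True)
    evals = (
        combined.groupby("vp_code")[
            ["levenshtein-distance", "longest-distance", "most_frequent-distance"]
        ]
        .mean()
        .rename(columns={
            "levenshtein-distance": "mean_lev",
            "longest-distance": "mean_long",
            "most_frequent-distance": "mean_freq",
        })
        .reset_index()
    )
    return evals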