From 6093556d6aff781cc3892cf3a8bc93e388d5abb2 Mon Sep 17 00:00:00 2001
From: Niclas Dobbertin
Date: Wed, 25 Oct 2023 23:04:27 +0200
Subject: extend ocr evaluation results

---
 bjoern/videoanalyse/post_processing.py |  9 +++++--
 bjoern/videoanalyse/tab_switch.py      | 14 +++-------
 bjoern/videoanalyse/utils.py           | 49 +++++++++++++++++++---------------
 3 files changed, 38 insertions(+), 34 deletions(-)

(limited to 'bjoern/videoanalyse')

diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index 445b72f..c295dc6 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -20,6 +20,7 @@ all_vp = [x for x in data_path.iterdir() if x.is_dir()]
 
 vp_results = []
 tab_results = []
+log_tab_results = []
 for vp_path in all_vp:
     log = LogParser.extract_activities(LogParser.get_log_data(vp_path))
     log = LogParser.generate_log(log)
@@ -45,11 +46,15 @@ for vp_path in all_vp:
 
     df = pd.read_csv(f"{vp_path}/metrics_grps.csv")
 
-    tab_df = tab_switch.tab_switches_per_type(df)
+    tab_df = tab_switch.tab_switches_per_type(df, "group_index")
     tab_results.append(tab_df)
     tab_df.to_csv(f"{vp_path}/tabswitches.csv")
 
+    log_tab_df = tab_switch.tab_switches_per_type(df, "log_url")
+    log_tab_results.append(log_tab_df)
+    log_tab_df.to_csv(f"{vp_path}/log_tabswitches.csv")
+
     vp_results.append(df)
 
-evals = utils.evaluate_results(tab_results)
+evals = utils.evaluate_results(tab_results, log_tab_results)
 evals.to_csv(f"{data_path}/evaluation.csv")
diff --git a/bjoern/videoanalyse/tab_switch.py b/bjoern/videoanalyse/tab_switch.py
index 2efa647..a0bb5de 100644
--- a/bjoern/videoanalyse/tab_switch.py
+++ b/bjoern/videoanalyse/tab_switch.py
@@ -3,15 +3,7 @@
 import pandas as pd
 
 
-def tab_switches_per_type(df):
-    # url_types = ["url", "log_url", "longest", "most_frequent"]
-    # dist_types = [
-    #     "levenshtein-distance",
-    #     "levenshtein-distance",
-    #     "longest-distance",
-    #     "most_frequent-distance",
-    # ]
-
+def tab_switches_per_type(df, key_column):
     result = {
         "vp_code": [],
         "log_url": [],
@@ -26,7 +18,7 @@
     count = -1
     for row in df.iterrows():
         row = row[1]
-        if row["group_index"] != last_group:
+        if row[key_column] != last_group:
             result["vp_code"].append(row["vp_code"])
             result["group"].append(row["group_index"])
             result["log_url"].append(row["log_url"])
@@ -35,7 +27,7 @@
             result["most_frequent_url"].append(row["most_frequent"])
             result["most_frequent-distance"].append(row["most_frequent-distance"])
 
-            last_group = row["group_index"]
+            last_group = row[key_column]
             if count == -1:
                 count = 1
                 continue
diff --git a/bjoern/videoanalyse/utils.py b/bjoern/videoanalyse/utils.py
index df00482..f9b2ca0 100644
--- a/bjoern/videoanalyse/utils.py
+++ b/bjoern/videoanalyse/utils.py
@@ -42,13 +42,15 @@ def combine_ocr_logs(video_path, ocr_path, log_path):
     # analysis.to_csv(vp_path / "merged.csv", quoting=csv.QUOTE_NONNUMERIC)
     return analysis
 
+
 def levendist_normalized(s1, s2, log_url):
     return levendist(s1, s2) / len(str(log_url))
 
 
 def calc_levenshtein_distance(df):
     df["levenshtein-distance"] = df.apply(
-        lambda row: levendist(str(row.url), str(row.log_url)) / len(str(row.log_url)), axis=1
+        lambda row: levendist(str(row.url), str(row.log_url)) / len(str(row.log_url)),
+        axis=1,
     )
     return df
 
@@ -124,44 +126,49 @@ def write_grouped_metrics(df, url_groups, data_path):
             row.append(levendist_normalized(log_url, longest_in_grp, log_url))
             most_frequent_in_grp = max(set(grp), key=grp.count)
             row.append(str(most_frequent_in_grp))
-            row.append(levendist_normalized(log_url, most_frequent_in_grp, log_url))
+            row.append(
+                levendist_normalized(log_url, most_frequent_in_grp, log_url)
+            )
             csv_writer.writerow(row)
 
 
-def evaluate_results(vp_results):
-    vp_code = [df["vp_code"].values[0] for df in vp_results]
-    # mean_lev = [
-    #     sum(df["levenshtein-distance"].values) / len(df["levenshtein-distance"])
-    #     for df in vp_results
-    # ]
+def evaluate_results(tab_results, log_tab_results):
+    vp_code = [df["vp_code"].values[0] for df in tab_results]
+
     mean_long = []
     mean_freq = []
-    for df in vp_results:
-        groups = set(df["group"].values)
+    count_groups = []
+    count_log_urls = []
+    count_grp_diff = []
+    grp_tabswitches = []
+    tabswitches_diff = []
+    for tab_df, log_tab_df in zip(tab_results, log_tab_results):
+        groups = set(tab_df["group"].values)
         group_long = 0
         group_freq = 0
+        count_groups.append(len(groups))
+        count_log_urls.append(len(set(tab_df["log_url"].values)))
+        count_grp_diff.append(len(groups) - len(set(tab_df["log_url"].values)))
+        grp_tabswitches.append(len(tab_df["group"].values))
+        tabswitches_diff.append(len(tab_df["group"].values) - len(log_tab_df["group"].values))
         for group in groups:
-            group_df = df.loc[df['group'] == group]
+            group_df = tab_df.loc[tab_df["group"] == group]
             group_long += group_df["longest-distance"].values[0]
             group_freq += group_df["most_frequent-distance"].values[0]
         mean_long.append(group_long / len(groups))
         mean_freq.append(group_freq / len(groups))
 
-    # mean_long = [
-    #     sum(df["longest-distance"].values) / len(df["longest-distance"])
-    #     for df in vp_results
-    # ]
-    # mean_freq = [
-    #     sum(df["most_frequent-distance"].values) / len(df["most_frequent-distance"])
-    #     for df in vp_results
-    # ]
     metrics = {
         "vp_code": vp_code,
-        # "mean_lev": mean_lev,
         "mean_long": mean_long,
         "mean_freq": mean_freq,
+        "count_groups": count_groups,
+        # "count_log_urls": count_log_urls,
+        "count_grp_diff": count_grp_diff,
+        "grp_tabswitches": grp_tabswitches,
+        "tabswitches_diff": tabswitches_diff,
     }
+
    evals = pd.DataFrame(metrics)
    return evals
--
cgit v1.2.3
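
Context for the change above: tab_switches_per_type now counts a switch whenever the chosen key column differs from the previous row, so the same table can be scored against the OCR-derived group_index or the logged log_url, and evaluate_results reports the per-participant gap between the two counts as tabswitches_diff. Below is a minimal self-contained sketch of that counting idea; the count_switches helper and the toy data are hypothetical illustrations, not code from this repository.

    # Hypothetical sketch of the switch-counting idea behind
    # tab_switches_per_type(df, key_column); not code from this repository.
    import pandas as pd


    def count_switches(df: pd.DataFrame, key_column: str) -> int:
        """Count rows whose key_column value differs from the row before."""
        keys = df[key_column]
        # Compare each row to its predecessor; drop the first row, which
        # has no predecessor to compare against.
        return int((keys != keys.shift()).iloc[1:].sum())


    # Toy data: the OCR grouping merged b.com and c.com into one group,
    # so it sees fewer switches than the browser log does.
    df = pd.DataFrame(
        {
            "group_index": [0, 0, 1, 1, 0],
            "log_url": ["a.com", "a.com", "b.com", "c.com", "a.com"],
        }
    )

    grp_switches = count_switches(df, "group_index")  # 2
    log_switches = count_switches(df, "log_url")      # 3
    print(grp_switches - log_switches)  # -1, analogous to tabswitches_diff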