From 6093556d6aff781cc3892cf3a8bc93e388d5abb2 Mon Sep 17 00:00:00 2001 From: Niclas Dobbertin Date: Wed, 25 Oct 2023 23:04:27 +0200 Subject: extend ocr evaluation results --- bjoern/videoanalyse/utils.py | 49 +++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 21 deletions(-) (limited to 'bjoern/videoanalyse/utils.py') diff --git a/bjoern/videoanalyse/utils.py b/bjoern/videoanalyse/utils.py index df00482..f9b2ca0 100644 --- a/bjoern/videoanalyse/utils.py +++ b/bjoern/videoanalyse/utils.py @@ -42,13 +42,15 @@ def combine_ocr_logs(video_path, ocr_path, log_path): # analysis.to_csv(vp_path / "merged.csv", quoting=csv.QUOTE_NONNUMERIC) return analysis + def levendist_normalized(s1, s2, log_url): return levendist(s1, s2) / len(str(log_url)) def calc_levenshtein_distance(df): df["levenshtein-distance"] = df.apply( - lambda row: levendist(str(row.url), str(row.log_url)) / len(str(row.log_url)), axis=1 + lambda row: levendist(str(row.url), str(row.log_url)) / len(str(row.log_url)), + axis=1, ) return df @@ -124,44 +126,49 @@ def write_grouped_metrics(df, url_groups, data_path): row.append(levendist_normalized(log_url, longest_in_grp, log_url)) most_frequent_in_grp = max(set(grp), key=grp.count) row.append(str(most_frequent_in_grp)) - row.append(levendist_normalized(log_url, most_frequent_in_grp, log_url)) + row.append( + levendist_normalized(log_url, most_frequent_in_grp, log_url) + ) csv_writer.writerow(row) -def evaluate_results(vp_results): - vp_code = [df["vp_code"].values[0] for df in vp_results] - # mean_lev = [ - # sum(df["levenshtein-distance"].values) / len(df["levenshtein-distance"]) - # for df in vp_results - # ] +def evaluate_results(tab_results, log_tab_results): + vp_code = [df["vp_code"].values[0] for df in tab_results] + mean_long = [] mean_freq = [] - for df in vp_results: - groups = set(df["group"].values) + count_groups = [] + count_log_urls = [] + count_grp_diff = [] + grp_tabswitches = [] + tabswitches_diff = [] + for tab_df, 
log_tab_df in zip(tab_results, log_tab_results): + groups = set(tab_df["group"].values) group_long = 0 group_freq = 0 + count_groups.append(len(groups)) + count_log_urls.append(len(set(tab_df["log_url"].values))) + count_grp_diff.append(len(groups) - len(set(tab_df["log_url"].values))) + grp_tabswitches.append(len(tab_df["group"].values)) + tabswitches_diff.append(len(tab_df["group"].values) - len(log_tab_df["group"].values)) for group in groups: - group_df = df.loc[df['group'] == group] + group_df = tab_df.loc[tab_df["group"] == group] group_long += group_df["longest-distance"].values[0] group_freq += group_df["most_frequent-distance"].values[0] mean_long.append(group_long / len(groups)) mean_freq.append(group_freq / len(groups)) - # mean_long = [ - # sum(df["longest-distance"].values) / len(df["longest-distance"]) - # for df in vp_results - # ] - # mean_freq = [ - # sum(df["most_frequent-distance"].values) / len(df["most_frequent-distance"]) - # for df in vp_results - # ] - metrics = { "vp_code": vp_code, - # "mean_lev": mean_lev, "mean_long": mean_long, "mean_freq": mean_freq, + "count_groups": count_groups, + # "count_log_urls": count_log_urls, + "count_grp_diff": count_grp_diff, + "grp_tabswitches": grp_tabswitches, + "tabswitches_diff": tabswitches_diff, } + evals = pd.DataFrame(metrics) return evals -- cgit v1.2.3