summaryrefslogtreecommitdiff
path: root/bjoern/videoanalyse/utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'bjoern/videoanalyse/utils.py')
-rw-r--r--bjoern/videoanalyse/utils.py49
1 file changed, 28 insertions, 21 deletions
diff --git a/bjoern/videoanalyse/utils.py b/bjoern/videoanalyse/utils.py
index df00482..f9b2ca0 100644
--- a/bjoern/videoanalyse/utils.py
+++ b/bjoern/videoanalyse/utils.py
@@ -42,13 +42,15 @@ def combine_ocr_logs(video_path, ocr_path, log_path):
# analysis.to_csv(vp_path / "merged.csv", quoting=csv.QUOTE_NONNUMERIC)
return analysis
+
def levendist_normalized(s1, s2, log_url):
return levendist(s1, s2) / len(str(log_url))
def calc_levenshtein_distance(df):
df["levenshtein-distance"] = df.apply(
- lambda row: levendist(str(row.url), str(row.log_url)) / len(str(row.log_url)), axis=1
+ lambda row: levendist(str(row.url), str(row.log_url)) / len(str(row.log_url)),
+ axis=1,
)
return df
@@ -124,44 +126,49 @@ def write_grouped_metrics(df, url_groups, data_path):
row.append(levendist_normalized(log_url, longest_in_grp, log_url))
most_frequent_in_grp = max(set(grp), key=grp.count)
row.append(str(most_frequent_in_grp))
- row.append(levendist_normalized(log_url, most_frequent_in_grp, log_url))
+ row.append(
+ levendist_normalized(log_url, most_frequent_in_grp, log_url)
+ )
csv_writer.writerow(row)
-def evaluate_results(vp_results):
- vp_code = [df["vp_code"].values[0] for df in vp_results]
- # mean_lev = [
- # sum(df["levenshtein-distance"].values) / len(df["levenshtein-distance"])
- # for df in vp_results
- # ]
+def evaluate_results(tab_results, log_tab_results):
+ vp_code = [df["vp_code"].values[0] for df in tab_results]
+
mean_long = []
mean_freq = []
- for df in vp_results:
- groups = set(df["group"].values)
+ count_groups = []
+ count_log_urls = []
+ count_grp_diff = []
+ grp_tabswitches = []
+ tabswitches_diff = []
+ for tab_df, log_tab_df in zip(tab_results, log_tab_results):
+ groups = set(tab_df["group"].values)
group_long = 0
group_freq = 0
+ count_groups.append(len(groups))
+ count_log_urls.append(len(set(tab_df["log_url"].values)))
+ count_grp_diff.append(len(groups) - len(set(tab_df["log_url"].values)))
+ grp_tabswitches.append(len(tab_df["group"].values))
+        tabswitches_diff.append(len(tab_df["group"].values) - len(log_tab_df["group"].values))
for group in groups:
- group_df = df.loc[df['group'] == group]
+ group_df = tab_df.loc[tab_df["group"] == group]
group_long += group_df["longest-distance"].values[0]
group_freq += group_df["most_frequent-distance"].values[0]
mean_long.append(group_long / len(groups))
mean_freq.append(group_freq / len(groups))
- # mean_long = [
- # sum(df["longest-distance"].values) / len(df["longest-distance"])
- # for df in vp_results
- # ]
- # mean_freq = [
- # sum(df["most_frequent-distance"].values) / len(df["most_frequent-distance"])
- # for df in vp_results
- # ]
-
metrics = {
"vp_code": vp_code,
- # "mean_lev": mean_lev,
"mean_long": mean_long,
"mean_freq": mean_freq,
+ "count_groups": count_groups,
+ # "count_log_urls": count_log_urls,
+ "count_grp_diff": count_grp_diff,
+ "grp_tabswitches": grp_tabswitches,
+ "tabswitches_diff": tabswitches_diff,
}
+
evals = pd.DataFrame(metrics)
return evals