summary | refs | log | tree | commit | diff
path: root/bjoern
diff options
context:
space:
mode:
Diffstat (limited to 'bjoern')
-rw-r--r-- bjoern/videoanalyse/post_processing.py 9
-rw-r--r-- bjoern/videoanalyse/tab_switch.py 14
-rw-r--r-- bjoern/videoanalyse/utils.py 49
3 files changed, 38 insertions, 34 deletions
diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index 445b72f..c295dc6 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -20,6 +20,7 @@ all_vp = [x for x in data_path.iterdir() if x.is_dir()]
vp_results = []
tab_results = []
+log_tab_results = []
for vp_path in all_vp:
log = LogParser.extract_activities(LogParser.get_log_data(vp_path))
log = LogParser.generate_log(log)
@@ -45,11 +46,15 @@ for vp_path in all_vp:
df = pd.read_csv(f"{vp_path}/metrics_grps.csv")
- tab_df = tab_switch.tab_switches_per_type(df)
+ tab_df = tab_switch.tab_switches_per_type(df, "group_index")
tab_results.append(tab_df)
tab_df.to_csv(f"{vp_path}/tabswitches.csv")
+ log_tab_df = tab_switch.tab_switches_per_type(df, "log_url")
+ log_tab_results.append(log_tab_df)
+ log_tab_df.to_csv(f"{vp_path}/log_tabswitches.csv")
+
vp_results.append(df)
-evals = utils.evaluate_results(tab_results)
+evals = utils.evaluate_results(tab_results, log_tab_results)
evals.to_csv(f"{data_path}/evaluation.csv")
diff --git a/bjoern/videoanalyse/tab_switch.py b/bjoern/videoanalyse/tab_switch.py
index 2efa647..a0bb5de 100644
--- a/bjoern/videoanalyse/tab_switch.py
+++ b/bjoern/videoanalyse/tab_switch.py
@@ -3,15 +3,7 @@
import pandas as pd
-def tab_switches_per_type(df):
- # url_types = ["url", "log_url", "longest", "most_frequent"]
- # dist_types = [
- # "levenshtein-distance",
- # "levenshtein-distance",
- # "longest-distance",
- # "most_frequent-distance",
- # ]
-
+def tab_switches_per_type(df, key_column):
result = {
"vp_code": [],
"log_url": [],
@@ -26,7 +18,7 @@ def tab_switches_per_type(df):
count = -1
for row in df.iterrows():
row = row[1]
- if row["group_index"] != last_group:
+ if row[key_column] != last_group:
result["vp_code"].append(row["vp_code"])
result["group"].append(row["group_index"])
result["log_url"].append(row["log_url"])
@@ -35,7 +27,7 @@ def tab_switches_per_type(df):
result["most_frequent_url"].append(row["most_frequent"])
result["most_frequent-distance"].append(row["most_frequent-distance"])
- last_group = row["group_index"]
+ last_group = row[key_column]
if count == -1:
count = 1
continue
diff --git a/bjoern/videoanalyse/utils.py b/bjoern/videoanalyse/utils.py
index df00482..f9b2ca0 100644
--- a/bjoern/videoanalyse/utils.py
+++ b/bjoern/videoanalyse/utils.py
@@ -42,13 +42,15 @@ def combine_ocr_logs(video_path, ocr_path, log_path):
# analysis.to_csv(vp_path / "merged.csv", quoting=csv.QUOTE_NONNUMERIC)
return analysis
+
def levendist_normalized(s1, s2, log_url):
return levendist(s1, s2) / len(str(log_url))
def calc_levenshtein_distance(df):
df["levenshtein-distance"] = df.apply(
- lambda row: levendist(str(row.url), str(row.log_url)) / len(str(row.log_url)), axis=1
+ lambda row: levendist(str(row.url), str(row.log_url)) / len(str(row.log_url)),
+ axis=1,
)
return df
@@ -124,44 +126,49 @@ def write_grouped_metrics(df, url_groups, data_path):
row.append(levendist_normalized(log_url, longest_in_grp, log_url))
most_frequent_in_grp = max(set(grp), key=grp.count)
row.append(str(most_frequent_in_grp))
- row.append(levendist_normalized(log_url, most_frequent_in_grp, log_url))
+ row.append(
+ levendist_normalized(log_url, most_frequent_in_grp, log_url)
+ )
csv_writer.writerow(row)
-def evaluate_results(vp_results):
- vp_code = [df["vp_code"].values[0] for df in vp_results]
- # mean_lev = [
- # sum(df["levenshtein-distance"].values) / len(df["levenshtein-distance"])
- # for df in vp_results
- # ]
+def evaluate_results(tab_results, log_tab_results):
+ vp_code = [df["vp_code"].values[0] for df in tab_results]
+ # Accumulators below get exactly one entry per (tab_df, log_tab_df) pair, so every column of the result has the same length as vp_code.
mean_long = []
mean_freq = []
- for df in vp_results:
- groups = set(df["group"].values)
+ count_groups = []
+ count_log_urls = []
+ count_grp_diff = []
+ grp_tabswitches = []
+ tabswitches_diff = []
+ for tab_df, log_tab_df in zip(tab_results, log_tab_results):
+ groups = set(tab_df["group"].values)
group_long = 0
group_freq = 0
+ count_groups.append(len(groups))
+ count_log_urls.append(len(set(tab_df["log_url"].values)))
+ count_grp_diff.append(len(groups) - len(set(tab_df["log_url"].values)))
+ grp_tabswitches.append(len(tab_df["group"].values))
+ tabswitches_diff.append(len(tab_df["group"].values) - len(log_tab_df["group"].values))  # append, not assign: assigning replaced the list with one int that DataFrame() broadcast to every row
for group in groups:
- group_df = df.loc[df['group'] == group]
+ group_df = tab_df.loc[tab_df["group"] == group]
group_long += group_df["longest-distance"].values[0]
group_freq += group_df["most_frequent-distance"].values[0]
mean_long.append(group_long / len(groups))
mean_freq.append(group_freq / len(groups))
- # mean_long = [
- # sum(df["longest-distance"].values) / len(df["longest-distance"])
- # for df in vp_results
- # ]
- # mean_freq = [
- # sum(df["most_frequent-distance"].values) / len(df["most_frequent-distance"])
- # for df in vp_results
- # ]
-
metrics = {
"vp_code": vp_code,
- # "mean_lev": mean_lev,
"mean_long": mean_long,
"mean_freq": mean_freq,
+ "count_groups": count_groups,
+ # "count_log_urls": count_log_urls,
+ "count_grp_diff": count_grp_diff,
+ "grp_tabswitches": grp_tabswitches,
+ "tabswitches_diff": tabswitches_diff,
}
+
evals = pd.DataFrame(metrics)
return evals