summary | refs | log | tree | commit | diff
path: root/bjoern/videoanalyse
diff options
context:
space:
mode:
Diffstat (limited to 'bjoern/videoanalyse')
-rw-r--r-- bjoern/videoanalyse/utils.py 13
1 files changed, 9 insertions, 4 deletions
diff --git a/bjoern/videoanalyse/utils.py b/bjoern/videoanalyse/utils.py
index 69ffa96..b1eaa4f 100644
--- a/bjoern/videoanalyse/utils.py
+++ b/bjoern/videoanalyse/utils.py
@@ -42,10 +42,13 @@ def combine_ocr_logs(video_path, ocr_path, log_path):
# analysis.to_csv(vp_path / "merged.csv", quoting=csv.QUOTE_NONNUMERIC)
return analysis
+def levendist_normalized(s1, s2, log_url):
+ return levendist(s1, s2) / len(str(log_url))
+
def calc_levenshtein_distance(df):
df["levenshtein-distance"] = df.apply(
- lambda row: levendist(str(row.url), str(row.log_url)), axis=1
+ lambda row: levendist(str(row.url), str(row.log_url)) / len(str(row.log_url)), axis=1
)
return df
@@ -111,15 +114,17 @@ def write_grouped_metrics(df, url_groups, data_path):
)
csv_writer.writerow(header)
for row in csv_reader:
+ ocr_url = row[2]
+ log_url = row[5]
for idx, grp in enumerate(url_groups):
- if row[2] in grp:
+ if ocr_url in grp:
row.append(str(idx))
longest_in_grp = max(grp, key=len)
row.append(longest_in_grp)
- row.append(levendist(row[5], longest_in_grp))
+ row.append(levendist_normalized(log_url, longest_in_grp, log_url))
most_frequent_in_grp = max(set(grp), key=grp.count)
row.append(str(most_frequent_in_grp))
- row.append(levendist(row[5], most_frequent_in_grp))
+ row.append(levendist_normalized(log_url, most_frequent_in_grp, log_url))
csv_writer.writerow(row)