diff options
Diffstat (limited to 'bjoern/videoanalyse')
-rw-r--r-- | bjoern/videoanalyse/utils.py | 13 |
1 files changed, 9 insertions, 4 deletions
diff --git a/bjoern/videoanalyse/utils.py b/bjoern/videoanalyse/utils.py index 69ffa96..b1eaa4f 100644 --- a/bjoern/videoanalyse/utils.py +++ b/bjoern/videoanalyse/utils.py @@ -42,10 +42,13 @@ def combine_ocr_logs(video_path, ocr_path, log_path): # analysis.to_csv(vp_path / "merged.csv", quoting=csv.QUOTE_NONNUMERIC) return analysis +def levendist_normalized(s1, s2, log_url): + return levendist(s1, s2) / len(str(log_url)) + def calc_levenshtein_distance(df): df["levenshtein-distance"] = df.apply( - lambda row: levendist(str(row.url), str(row.log_url)), axis=1 + lambda row: levendist(str(row.url), str(row.log_url)) / len(str(row.log_url)), axis=1 ) return df @@ -111,15 +114,17 @@ def write_grouped_metrics(df, url_groups, data_path): ) csv_writer.writerow(header) for row in csv_reader: + ocr_url = row[2] + log_url = row[5] for idx, grp in enumerate(url_groups): - if row[2] in grp: + if ocr_url in grp: row.append(str(idx)) longest_in_grp = max(grp, key=len) row.append(longest_in_grp) - row.append(levendist(row[5], longest_in_grp)) + row.append(levendist_normalized(log_url, longest_in_grp, log_url)) most_frequent_in_grp = max(set(grp), key=grp.count) row.append(str(most_frequent_in_grp)) - row.append(levendist(row[5], most_frequent_in_grp)) + row.append(levendist_normalized(log_url, most_frequent_in_grp, log_url)) csv_writer.writerow(row) |