From 14099a1af49176e619981e0db7f29c739517d8f5 Mon Sep 17 00:00:00 2001 From: areyoumee Date: Wed, 4 Oct 2023 12:01:22 +0200 Subject: normalized levendist --- bjoern/videoanalyse/utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'bjoern') diff --git a/bjoern/videoanalyse/utils.py b/bjoern/videoanalyse/utils.py index e060a89..69215ff 100644 --- a/bjoern/videoanalyse/utils.py +++ b/bjoern/videoanalyse/utils.py @@ -44,10 +44,13 @@ def combine_ocr_logs(video_path, ocr_path, log_path): # analysis.to_csv(vp_path / "merged.csv", quoting=csv.QUOTE_NONNUMERIC) return analysis +def levendist_normalized(s1, s2, log_url): + return levendist(s1, s2) / len(str(log_url)) + def calc_levenshtein_distance(df): df["levenshtein-distance"] = df.apply( - lambda row: levendist(str(row.url), str(row.log_url)), axis=1 + lambda row: levendist(str(row.url), str(row.log_url)) / len(str(row.log_url)), axis=1 ) return df @@ -113,13 +116,15 @@ def write_grouped_metrics(df, url_groups, data_path): ) csv_writer.writerow(header) for row in csv_reader: + ocr_url = row[2] + log_url = row[5] for idx, grp in enumerate(url_groups): - if row[2] in grp: + if ocr_url in grp: row.append(str(idx)) longest_in_grp = max(grp, key=len) row.append(longest_in_grp) - row.append(levendist(row[5], longest_in_grp)) + row.append(levendist_normalized(log_url, longest_in_grp, log_url)) most_frequent_in_grp = max(set(grp), key=grp.count) row.append(str(most_frequent_in_grp)) - row.append(levendist(row[5], most_frequent_in_grp)) + row.append(levendist_normalized(log_url, most_frequent_in_grp, log_url)) csv_writer.writerow(row) -- cgit v1.2.3