From 7a0e5ea2acc577e51ab7e46e424e62d6fdeef405 Mon Sep 17 00:00:00 2001
From: Niclas Dobbertin
Date: Thu, 14 Sep 2023 20:29:11 +0200
Subject: use DBSCAN for url grouping

---
 bjoern/videoanalyse/post_processing.py | 99 +++++++++++++++++++++-------------
 1 file changed, 61 insertions(+), 38 deletions(-)

(limited to 'bjoern')

diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index 6ab2b0f..cc0599f 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -2,21 +2,21 @@ import argparse
 
 from pathlib import Path
+import numpy as np
 import pandas as pd
 import Levenshtein
 import csv
 from itertools import pairwise
+from sklearn.metrics import pairwise_distances
+from sklearn.cluster import DBSCAN
+from pprint import pprint
 
-argparser = argparse.ArgumentParser(
-    description="Distance evaluation"
-)
-argparser.add_argument(
-    "vp_dir", help="Directory containing metrics.csv"
-)
+argparser = argparse.ArgumentParser(description="Distance evaluation")
+argparser.add_argument("vp_dir", help="Directory containing metrics.csv")
 args = argparser.parse_args()
 
-data_path = Path(args.vp_dir )
+data_path = Path(args.vp_dir)
 
 
 # def insertion_cost(char):
@@ -48,9 +48,7 @@ def take_similar(original, candidates):
     print(original)
     print(candidates)
     result = [
-        x
-        for x in candidates
-        if dist_threshold >= Levenshtein.distance(original, x)
+        x for x in candidates if dist_threshold >= Levenshtein.distance(original, x)
     ]
     return result
 
@@ -60,14 +58,39 @@ def take_similar(original, candidates):
 #     reader = csv.reader(csvfile, quotechar='"')
 #     print(next(reader))
 #
+
+
 df = pd.read_csv(data_path / "metrics.csv")
-df = df.fillna('')
+df = df.fillna("")
 
 # List with only urls
 all_urls = list(df["url"].values)
 urls = list(df["url"].values)
 
+
+def group_urls(urls):
+    unique_urls = np.unique(urls)
+
+    # TODO: casting deprecation np
+    def levenshtein_from_idx(idx1, idx2):
+        return Levenshtein.distance(unique_urls[int(idx1)], unique_urls[int(idx2)])
+
+    X = np.searchsorted(unique_urls, list([[x] for x in urls]))
+
+    distance_matrix = pairwise_distances(
+        X=X, Y=None, metric=levenshtein_from_idx, n_jobs=-1
+    )
+    # TODO: eps and min_samples parameter
+    db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)
+    labels = db.labels_
+    pprint(list(zip(urls, labels)))
+    return labels
+
+
+labels = group_urls(urls)
+print(list(labels))
+
+
 # urls = [[0, "Start"]]
 # for url in all_urls:
 #     if len(url[1]) > 0:
@@ -76,30 +99,30 @@ urls = list(df["url"].values)
 
 # Iterate over list of all urls, putting similar one into a group and removing them from
 # the original list
-url_groups = []
-while len(all_urls) > 0:
-    group = take_similar(all_urls[0], all_urls)
-    url_groups.append(group)
-    for url in group:
-        all_urls.remove(url)
-
-# for every row check which group its url belongs to and add a column with group indices
-# also add columns with longest/most frequent url in group
-with open (data_path / "metrics.csv", "r") as input_file, \
-     open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
-    csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-    csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-    header = next(csv_reader)
-    header.extend(["group_index","longest","longest-distance","most_frequent","most_frequent-distance"])
-    csv_writer.writerow(header)
-    for row in csv_reader:
-        for idx, grp in enumerate(url_groups):
-            if row[3] in grp:
-                row.append(idx)
-                longest_in_grp = max(grp, key=len)
-                row.append(longest_in_grp)
-                row.append(Levenshtein.distance(row[6], longest_in_grp))
-                most_frequent_in_grp = max(set(grp), key=grp.count)
-                row.append(most_frequent_in_grp)
-                row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
-                csv_writer.writerow(row)
+# url_groups = []
+# while len(all_urls) > 0:
+#     group = take_similar(all_urls[0], all_urls)
+#     url_groups.append(group)
+#     for url in group:
+#         all_urls.remove(url)
+
+# # for every row check which group its url belongs to and add a column with group indices
+# # also add columns with longest/most frequent url in group
+# with open (data_path / "metrics.csv", "r") as input_file, \
+#     open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
+#     csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+#     csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+#     header = next(csv_reader)
+#     header.extend(["group_index","longest","longest-distance","most_frequent","most_frequent-distance"])
+#     csv_writer.writerow(header)
+#     for row in csv_reader:
+#         for idx, grp in enumerate(url_groups):
+#             if row[3] in grp:
+#                 row.append(idx)
+#                 longest_in_grp = max(grp, key=len)
+#                 row.append(longest_in_grp)
+#                 row.append(Levenshtein.distance(row[6], longest_in_grp))
+#                 most_frequent_in_grp = max(set(grp), key=grp.count)
+#                 row.append(most_frequent_in_grp)
+#                 row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
+#                 csv_writer.writerow(row)
-- 
cgit v1.2.3
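
The new group_urls in this patch maps each URL to its index in the sorted unique_urls array, lets sklearn's pairwise_distances call Levenshtein.distance on those indices, and then clusters the precomputed matrix with DBSCAN instead of the old greedy threshold grouping. Below is a minimal standalone sketch of that idea; the sample URLs and the eps/min_samples values are illustrative assumptions only (the commit itself still marks those parameters as TODO).

# Sketch only, not part of the commit: cluster URLs by Levenshtein distance
# using DBSCAN on a precomputed distance matrix.
import numpy as np
import Levenshtein
from sklearn.cluster import DBSCAN

# Hypothetical sample data; in the patch the URLs come from metrics.csv.
urls = [
    "https://example.org/articles/1",
    "https://example.org/articles/2",
    "https://example.org/articles/3",
    "https://other.example.com/login",
    "https://other.example.com/login?next=/",
]

# Build the full pairwise Levenshtein distance matrix.
n = len(urls)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        d = Levenshtein.distance(urls[i], urls[j])
        distance_matrix[i, j] = distance_matrix[j, i] = d

# metric="precomputed" makes DBSCAN read the input as distances, not features.
# URLs that fall in no cluster get the label -1 (noise).
labels = DBSCAN(eps=10, min_samples=2, metric="precomputed").fit(distance_matrix).labels_
for url, label in zip(urls, labels):
    print(label, url)

Unlike the removed while/take_similar loop, the clustering result does not depend on the order in which URLs are visited; the trade-off is that eps and min_samples still have to be tuned for the data at hand.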