From 7a0e5ea2acc577e51ab7e46e424e62d6fdeef405 Mon Sep 17 00:00:00 2001
From: Niclas Dobbertin
Date: Thu, 14 Sep 2023 20:29:11 +0200
Subject: use DBSCAN for url grouping

---
 bjoern/videoanalyse/post_processing.py | 99 +++++++++++++++++++++-------------
 1 file changed, 61 insertions(+), 38 deletions(-)

diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index 6ab2b0f..cc0599f 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -2,21 +2,21 @@ import argparse
 from pathlib import Path
+import numpy as np
 import pandas as pd
 import Levenshtein
 import csv
 from itertools import pairwise
+from sklearn.metrics import pairwise_distances
+from sklearn.cluster import DBSCAN
+from pprint import pprint
 
-argparser = argparse.ArgumentParser(
-    description="Distance evaluation"
-)
-argparser.add_argument(
-    "vp_dir", help="Directory containing metrics.csv"
-)
+argparser = argparse.ArgumentParser(description="Distance evaluation")
+argparser.add_argument("vp_dir", help="Directory containing metrics.csv")
 args = argparser.parse_args()
 
-data_path = Path(args.vp_dir )
+data_path = Path(args.vp_dir)
 
 
 # def insertion_cost(char):
@@ -48,9 +48,7 @@ def take_similar(original, candidates):
     print(original)
     print(candidates)
     result = [
-        x
-        for x in candidates
-        if dist_threshold >= Levenshtein.distance(original, x)
+        x for x in candidates if dist_threshold >= Levenshtein.distance(original, x)
     ]
     return result
 
@@ -60,14 +58,39 @@ def take_similar(original, candidates):
 #     reader = csv.reader(csvfile, quotechar='"')
 #     print(next(reader))
 #
+
+
 df = pd.read_csv(data_path / "metrics.csv")
-df = df.fillna('')
+df = df.fillna("")
 
 # List with only urls
 all_urls = list(df["url"].values)
 urls = list(df["url"].values)
+
+def group_urls(urls):
+    unique_urls = np.unique(urls)
+
+    # TODO: casting deprecation np
+    def levenshtein_from_idx(idx1, idx2):
+        return Levenshtein.distance(unique_urls[int(idx1)], unique_urls[int(idx2)])
+
+    X = np.searchsorted(unique_urls, list([[x] for x in urls]))
+
+    distance_matrix = pairwise_distances(
+        X=X, Y=None, metric=levenshtein_from_idx, n_jobs=-1
+    )
+    # TODO: eps and min_samples parameter
+    db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)
+    labels = db.labels_
+    pprint(list(zip(urls, labels)))
+    return labels
+
+labels = group_urls(urls)
+print(list(labels))
+
+
 # urls = [[0, "Start"]]
 # for url in all_urls:
 #     if len(url[1]) > 0:
@@ -76,30 +99,30 @@ urls = list(df["url"].values)
 
 # Iterate over list of all urls, putting similar one into a group and removing them from
 # the original list
-url_groups = []
-while len(all_urls) > 0:
-    group = take_similar(all_urls[0], all_urls)
-    url_groups.append(group)
-    for url in group:
-        all_urls.remove(url)
-
-# for every row check which group its url belongs to and add a column with group indices
-# also add columns with longest/most frequent url in group
-with open (data_path / "metrics.csv", "r") as input_file, \
-    open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
-    csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-    csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-    header = next(csv_reader)
-    header.extend(["group_index","longest","longest-distance","most_frequent","most_frequent-distance"])
-    csv_writer.writerow(header)
-    for row in csv_reader:
-        for idx, grp in enumerate(url_groups):
-            if row[3] in grp:
-                row.append(idx)
-                longest_in_grp = max(grp, key=len)
-                row.append(longest_in_grp)
-                row.append(Levenshtein.distance(row[6], longest_in_grp))
-                most_frequent_in_grp = max(set(grp), key=grp.count)
-                row.append(most_frequent_in_grp)
-                row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
-                csv_writer.writerow(row)
+# url_groups = []
+# while len(all_urls) > 0:
+#     group = take_similar(all_urls[0], all_urls)
+#     url_groups.append(group)
+#     for url in group:
+#         all_urls.remove(url)
+
+# # for every row check which group its url belongs to and add a column with group indices
+# # also add columns with longest/most frequent url in group
+# with open (data_path / "metrics.csv", "r") as input_file, \
+#     open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
+#     csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+#     csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+#     header = next(csv_reader)
+#     header.extend(["group_index","longest","longest-distance","most_frequent","most_frequent-distance"])
+#     csv_writer.writerow(header)
+#     for row in csv_reader:
+#         for idx, grp in enumerate(url_groups):
+#             if row[3] in grp:
+#                 row.append(idx)
+#                 longest_in_grp = max(grp, key=len)
+#                 row.append(longest_in_grp)
+#                 row.append(Levenshtein.distance(row[6], longest_in_grp))
+#                 most_frequent_in_grp = max(set(grp), key=grp.count)
+#                 row.append(most_frequent_in_grp)
+#                 row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
+#                 csv_writer.writerow(row)
--
cgit v1.2.3
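
A note on the two TODOs that the new group_urls() leaves open: the numpy casting
deprecation appears to come from calling int() on the 1-element rows that
pairwise_distances hands the metric callable, and eps/min_samples are untuned
guesses. The following is a minimal standalone sketch of the same grouping,
assuming scikit-learn, numpy, and the Levenshtein package are installed; it
resolves the deprecation by indexing row[0] first, carries over eps=10 and
min_samples=5 from the patch as unvalidated placeholders, and uses hypothetical
sample urls.

import numpy as np
import Levenshtein
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances


def group_urls(urls):
    unique_urls = np.unique(urls)

    def levenshtein_from_idx(row1, row2):
        # pairwise_distances passes each sample as a 1-element float row;
        # indexing row[0] before int() avoids the numpy deprecation that
        # converting a whole 1-element array to a scalar triggers.
        return Levenshtein.distance(
            unique_urls[int(row1[0])], unique_urls[int(row2[0])]
        )

    # Encode every url as the index of its unique value, shape (n_samples, 1),
    # so the callable metric only shuffles integers instead of strings.
    X = np.searchsorted(unique_urls, urls).reshape(-1, 1)
    distance_matrix = pairwise_distances(X, metric=levenshtein_from_idx, n_jobs=-1)

    # eps: largest edit distance still treated as "similar";
    # min_samples: smallest neighbourhood that forms a group.
    # Both values are copied from the patch, not tuned.
    db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)
    return db.labels_  # label -1 marks urls DBSCAN treats as noise


if __name__ == "__main__":
    # Hypothetical urls: five near-duplicates plus one outlier.
    urls = [f"https://example.com/page?id={i}" for i in range(5)]
    urls.append("https://other.org/start")
    print(list(group_urls(urls)))  # e.g. [0, 0, 0, 0, 0, -1]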