Diffstat (limited to 'bjoern/videoanalyse')
-rw-r--r--  bjoern/videoanalyse/post_processing.py  118
1 file changed, 48 insertions(+), 70 deletions(-)
diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index cc0599f..94ce067 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -6,7 +6,8 @@ import numpy as np
import pandas as pd
import Levenshtein
import csv
-from itertools import pairwise
+from itertools import groupby
+from operator import itemgetter
from sklearn.metrics import pairwise_distances
from sklearn.cluster import DBSCAN
from pprint import pprint
@@ -18,41 +19,6 @@ args = argparser.parse_args()
data_path = Path(args.vp_dir)
-
-# def insertion_cost(char):
-# return 1.0
-
-
-# def deletion_cost(char):
-# return 1.0
-
-
-# def substitution_cost(char_a, char_b):
-# if char_a == "t" and char_b == "r":
-# return 0.5
-# return 1.0
-
-
-# weighted_levenshtein = WeightedLevenshtein(
-# substitution_cost_fn=substitution_cost,
-# insertion_cost_fn=insertion_cost,
-# deletion_cost_fn=deletion_cost,
-# )
-
-# Distance threshold to define "same" url
-dist_threshold = 5
-
-
-# Function to return all elements in candidates that are similar to original
-def take_similar(original, candidates):
-    print(original)
-    print(candidates)
-    result = [
-        x for x in candidates if dist_threshold >= Levenshtein.distance(original, x)
-    ]
-    return result
-
-
# Read results.csv
# with open(data_path / "metrics.csv", "r") as csvfile:
# reader = csv.reader(csvfile, quotechar='"')
@@ -84,45 +50,57 @@ def group_urls(urls):
    # TODO: eps and min_samples parameter
    db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)
    labels = db.labels_
-    pprint(list(zip(urls, labels)))
-    return labels
+    zipped = zip(urls, labels)
+
+    # grouping solution from: https://www.geeksforgeeks.org/python-group-tuple-into-list-based-on-value/
+    # create an empty dictionary to store the grouped tuples
+    grouped_dict = {}
-labels = group_urls(urls)
-print(list(labels))
+    # loop through the (url, label) tuples
+    for tup in zipped:
+        # the second element of the tuple is the cluster label
+        key = tup[1]
+        # if the key is not already in the dictionary, add it with an empty list as value
+        if key not in grouped_dict:
+            grouped_dict[key] = []
+        # append the current tuple's url to the list corresponding to the key in the dictionary
+        grouped_dict[key].append(tup[0])
+    # collect the dictionary values as the list of URL groups
+    url_groups = [v for _, v in grouped_dict.items()]
-# urls = [[0, "Start"]]
-# for url in all_urls:
-# if len(url[1]) > 0:
-# urls.append([float(url[0]), url[1]])
+    return url_groups
-# Iterate over list of all urls, putting similar one into a group and removing them from
-# the original list
-# url_groups = []
-# while len(all_urls) > 0:
-# group = take_similar(all_urls[0], all_urls)
-# url_groups.append(group)
-# for url in group:
-# all_urls.remove(url)
+url_groups = group_urls(urls)
+pprint(len(url_groups))
# # for every row check which group its url belongs to and add a column with group indices
# # also add columns with longest/most frequent url in group
-# with open (data_path / "metrics.csv", "r") as input_file, \
-# open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
-# csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-# csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-# header = next(csv_reader)
-# header.extend(["group_index","longest","longest-distance","most_frequent","most_frequent-distance"])
-# csv_writer.writerow(header)
-# for row in csv_reader:
-# for idx, grp in enumerate(url_groups):
-# if row[3] in grp:
-# row.append(idx)
-# longest_in_grp = max(grp, key=len)
-# row.append(longest_in_grp)
-# row.append(Levenshtein.distance(row[6], longest_in_grp))
-# most_frequent_in_grp = max(set(grp), key=grp.count)
-# row.append(most_frequent_in_grp)
-# row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
-# csv_writer.writerow(row)
+with open(data_path / "metrics.csv", "r") as input_file, open(
+    data_path / "metrics_grps.csv", "w", newline=""
+) as output_file:
+    csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    header = next(csv_reader)
+    header.extend(
+        [
+            "group_index",
+            "longest",
+            "longest-distance",
+            "most_frequent",
+            "most_frequent-distance",
+        ]
+    )
+    csv_writer.writerow(header)
+    for row in csv_reader:
+        for idx, grp in enumerate(url_groups):
+            if row[3] in grp:
+                row.append(idx)
+                longest_in_grp = max(grp, key=len)
+                row.append(longest_in_grp)
+                row.append(Levenshtein.distance(row[6], longest_in_grp))
+                most_frequent_in_grp = max(set(grp), key=grp.count)
+                row.append(most_frequent_in_grp)
+                row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
+        csv_writer.writerow(row)
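
The new group_urls above consumes a precomputed distance_matrix and a urls list whose construction lies outside the visible hunks. A minimal sketch of that missing step, assuming urls is a plain list of URL strings collected earlier in the script; the eps and min_samples values simply mirror the TODO in the hunk and are hypothetical, not tuned:

import numpy as np
import Levenshtein
from sklearn.cluster import DBSCAN


def group_urls(urls):
    # pairwise Levenshtein edit distances, passed to DBSCAN as a precomputed metric
    n = len(urls)
    distance_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            dist = Levenshtein.distance(urls[i], urls[j])
            distance_matrix[i, j] = dist
            distance_matrix[j, i] = dist

    # cluster near-identical URL strings; label -1 (DBSCAN noise) simply becomes its own group
    db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)

    # collect the URLs of each cluster label into one group
    groups = {}
    for url, label in zip(urls, db.labels_):
        groups.setdefault(label, []).append(url)
    return list(groups.values())

The groupby and itemgetter imports added in the first hunk would allow the same final grouping via sorted(zip(urls, labels), key=itemgetter(1)) followed by itertools.groupby, though the committed code keeps the explicit dictionary loop.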