Diffstat (limited to 'bjoern')
 bjoern/videoanalyse/post_processing.py | 99
 1 file changed, 61 insertions(+), 38 deletions(-)
diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index 6ab2b0f..cc0599f 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -2,21 +2,21 @@
import argparse
from pathlib import Path
+import numpy as np
import pandas as pd
import Levenshtein
import csv
from itertools import pairwise
+from sklearn.metrics import pairwise_distances
+from sklearn.cluster import DBSCAN
+from pprint import pprint
-argparser = argparse.ArgumentParser(
- description="Distance evaluation"
-)
-argparser.add_argument(
- "vp_dir", help="Directory containing metrics.csv"
-)
+argparser = argparse.ArgumentParser(description="Distance evaluation")
+argparser.add_argument("vp_dir", help="Directory containing metrics.csv")
args = argparser.parse_args()
-data_path = Path(args.vp_dir )
+data_path = Path(args.vp_dir)
# def insertion_cost(char):
@@ -48,9 +48,7 @@ def take_similar(original, candidates):
print(original)
print(candidates)
result = [
- x
- for x in candidates
- if dist_threshold >= Levenshtein.distance(original, x)
+ x for x in candidates if dist_threshold >= Levenshtein.distance(original, x)
]
return result
@@ -60,14 +58,39 @@ def take_similar(original, candidates):
# reader = csv.reader(csvfile, quotechar='"')
# print(next(reader))
#
+
+
df = pd.read_csv(data_path / "metrics.csv")
-df = df.fillna('')
+df = df.fillna("")
# List with only urls
all_urls = list(df["url"].values)
urls = list(df["url"].values)
+
+def group_urls(urls):
+ unique_urls = np.unique(urls)
+
+ # TODO: casting deprecation np
+ def levenshtein_from_idx(idx1, idx2):
+ return Levenshtein.distance(unique_urls[int(idx1)], unique_urls[int(idx2)])
+
+ X = np.searchsorted(unique_urls, list([[x] for x in urls]))
+
+ distance_matrix = pairwise_distances(
+ X=X, Y=None, metric=levenshtein_from_idx, n_jobs=-1
+ )
+ # TODO: eps and min_samples parameter
+ db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)
+ labels = db.labels_
+ pprint(list(zip(urls, labels)))
+ return labels
+
+labels = group_urls(urls)
+print(list(labels))
+
+
# urls = [[0, "Start"]]
# for url in all_urls:
# if len(url[1]) > 0:
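
A minimal, self-contained sketch of the clustering step added in this hunk: URLs are grouped by running DBSCAN on a precomputed Levenshtein distance matrix. The example URLs and the eps/min_samples values below are placeholders only (the TODO in the patch notes that both parameters still need tuning); the patch itself builds the matrix with pairwise_distances over indices into the unique URLs instead of a nested comprehension.

import numpy as np
import Levenshtein
from sklearn.cluster import DBSCAN

# Placeholder URLs, only to make the sketch runnable.
urls = [
    "example.com/page?id=1",
    "example.com/page?id=2",
    "example.com/page?id=3",
    "other.org/start",
    "other.org/start#top",
]

# Symmetric matrix of pairwise edit distances between the URL strings.
distance_matrix = np.array(
    [[Levenshtein.distance(a, b) for b in urls] for a in urls]
)

# metric="precomputed" makes DBSCAN treat the input as distances rather
# than feature vectors; a label of -1 marks noise (ungrouped URLs).
db = DBSCAN(eps=5, min_samples=2, metric="precomputed").fit(distance_matrix)
print(list(zip(urls, db.labels_)))
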
@@ -76,30 +99,30 @@ urls = list(df["url"].values)
# Iterate over list of all urls, putting similar one into a group and removing them from
# the original list
-url_groups = []
-while len(all_urls) > 0:
- group = take_similar(all_urls[0], all_urls)
- url_groups.append(group)
- for url in group:
- all_urls.remove(url)
-
-# for every row check which group its url belongs to and add a column with group indices
-# also add columns with longest/most frequent url in group
-with open (data_path / "metrics.csv", "r") as input_file, \
- open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
- csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
- csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
- header = next(csv_reader)
- header.extend(["group_index","longest","longest-distance","most_frequent","most_frequent-distance"])
- csv_writer.writerow(header)
- for row in csv_reader:
- for idx, grp in enumerate(url_groups):
- if row[3] in grp:
- row.append(idx)
- longest_in_grp = max(grp, key=len)
- row.append(longest_in_grp)
- row.append(Levenshtein.distance(row[6], longest_in_grp))
- most_frequent_in_grp = max(set(grp), key=grp.count)
- row.append(most_frequent_in_grp)
- row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
- csv_writer.writerow(row)
+# url_groups = []
+# while len(all_urls) > 0:
+# group = take_similar(all_urls[0], all_urls)
+# url_groups.append(group)
+# for url in group:
+# all_urls.remove(url)
+
+# # for every row check which group its url belongs to and add a column with group indices
+# # also add columns with longest/most frequent url in group
+# with open (data_path / "metrics.csv", "r") as input_file, \
+# open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
+# csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+# csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+# header = next(csv_reader)
+# header.extend(["group_index","longest","longest-distance","most_frequent","most_frequent-distance"])
+# csv_writer.writerow(header)
+# for row in csv_reader:
+# for idx, grp in enumerate(url_groups):
+# if row[3] in grp:
+# row.append(idx)
+# longest_in_grp = max(grp, key=len)
+# row.append(longest_in_grp)
+# row.append(Levenshtein.distance(row[6], longest_in_grp))
+# most_frequent_in_grp = max(set(grp), key=grp.count)
+# row.append(most_frequent_in_grp)
+# row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
+# csv_writer.writerow(row)
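
The commented-out block above used to attach a group index plus the longest and most frequent URL of each group to every row of metrics.csv. A hypothetical follow-up (not part of this patch) could rebuild the same columns from the DBSCAN labels with pandas; df, labels and data_path are the objects from the patched script, everything else below is an assumption.

import Levenshtein

# Assumed follow-up: df, labels and data_path come from the script above.
df["group_index"] = labels

# Longest and most frequent URL per group, broadcast back to every row.
df["longest"] = df.groupby("group_index")["url"].transform(lambda s: max(s, key=len))
df["most_frequent"] = df.groupby("group_index")["url"].transform(lambda s: s.mode().iloc[0])

# Edit distance of each row's URL to its group's representatives.
df["longest-distance"] = [
    Levenshtein.distance(u, l) for u, l in zip(df["url"], df["longest"])
]
df["most_frequent-distance"] = [
    Levenshtein.distance(u, m) for u, m in zip(df["url"], df["most_frequent"])
]

df.to_csv(data_path / "metrics_grps.csv", index=False)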