From d35d9497f4044352ec94cb736aec9a80a8d25b19 Mon Sep 17 00:00:00 2001
From: areyoumee <ayumilara.bischoff@stud.tu-darmstadt.de>
Date: Thu, 10 Aug 2023 18:46:44 +0200
Subject: clean up + add group metrics

---
 bjoern/videoanalyse/post_processing.py | 41 ++++++++++++++--------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

(limited to 'bjoern/videoanalyse')

diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index a1baa5a..428cb5d 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -79,30 +79,23 @@ urls = list(df["url"].values)
 url_groups = []
 while len(all_urls) > 0:
     group = take_similar(all_urls[0], all_urls)
-    url_groups.append([set(group), 0])
+    url_groups.append(group)
     for url in group:
         all_urls.remove(url)
 
-# Iterate over result-elements pairwise, removing elements under distance threshold
-# and always cumulating time of url-groups
-new_urls = []
-cum_times = []
-for pair in pairwise(urls):
-    print(pair)
-    dist = Levenshtein.distance(pair[0], pair[1])
-    if dist > dist_threshold:
-        new_urls.append(pair[1])
-
-
-with open(data_path / "grouping_post.csv", "w") as csvfile:
-    writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-    writer.writerow(["url"])
-    for line in new_urls:
-        writer.writerow(line)
-
-with open(data_path / "all_urls.txt", "w") as f:
-    for group in url_groups:
-        f.write("=== new group, cumulative_time: {}\n".format(group[1]))
-        for url in group[0]:
-            f.write(url)
-            f.write("\n")
+# For every metrics row, find the url group containing its url (column 3) and
+# append: the group index, the group's longest url, the group's most frequent url.
+with open(data_path / "metrics.csv", "r", newline='') as input_file, \
+    open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
+    csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    header = next(csv_reader)
+    header.extend(["group_index","longest","most frequent"])
+    csv_writer.writerow(header)
+    for row in csv_reader:
+        for idx, grp in enumerate(url_groups):
+            if row[3] in grp:
+                # max over grp (not set(grp)) so ties resolve to the first url, reproducibly
+                row.extend([idx, max(grp, key=len), max(grp, key=grp.count)])
+                break  # one group per row keeps the column count equal to the header
+        csv_writer.writerow(row)
-- 
cgit v1.2.3