author     areyoumee <ayumilara.bischoff@stud.tu-darmstadt.de>  2023-08-10 18:46:44 +0200
committer  areyoumee <ayumilara.bischoff@stud.tu-darmstadt.de>  2023-08-10 18:46:44 +0200
commit     d35d9497f4044352ec94cb736aec9a80a8d25b19 (patch)
tree       98cc65b2983bcfdac55a7b79794d6d7c406a8f47 /bjoern/videoanalyse
parent     d013131cbb37125c17c0570c293ab997e7c6710e (diff)
clean up + add group metrics
Diffstat (limited to 'bjoern/videoanalyse')
-rw-r--r--  bjoern/videoanalyse/post_processing.py  41
1 file changed, 17 insertions, 24 deletions
diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index a1baa5a..428cb5d 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -79,30 +79,23 @@ urls = list(df["url"].values)
 url_groups = []
 while len(all_urls) > 0:
     group = take_similar(all_urls[0], all_urls)
-    url_groups.append([set(group), 0])
+    url_groups.append(group)
     for url in group:
         all_urls.remove(url)
 
-# Iterate over result-elements pairwise, removing elements under distance threshold
-# and always cumulating time of url-groups
-new_urls = []
-cum_times = []
-for pair in pairwise(urls):
-    print(pair)
-    dist = Levenshtein.distance(pair[0], pair[1])
-    if dist > dist_threshold:
-        new_urls.append(pair[1])
-
-
-with open(data_path / "grouping_post.csv", "w") as csvfile:
-    writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
-    writer.writerow(["url"])
-    for line in new_urls:
-        writer.writerow(line)
-
-with open(data_path / "all_urls.txt", "w") as f:
-    for group in url_groups:
-        f.write("=== new group, cumulative_time: {}\n".format(group[1]))
-        for url in group[0]:
-            f.write(url)
-            f.write("\n")
+# for every row check which group its url belongs to and add a column with group indices
+# also add columns with longest/most frequent url in group
+with open(data_path / "metrics.csv", "r") as input_file, \
+    open(data_path / "metrics_grps.csv", "w", newline='') as output_file:
+    csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
+    header = next(csv_reader)
+    header.extend(["group_index", "longest", "most frequent"])
+    csv_writer.writerow(header)
+    for row in csv_reader:
+        for idx, grp in enumerate(url_groups):
+            if row[3] in grp:
+                row.append(idx)
+                row.append(max(grp, key=len))
+                row.append(max(set(grp), key=grp.count))
+        csv_writer.writerow(row)
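
For context on the commit above, here is a minimal standalone sketch of the grouping step the new metrics columns rely on. It is not part of the commit; take_similar, dist_threshold, and the sample urls are assumptions modelled on the names used in post_processing.py, whose real definitions lie outside this hunk.

# Minimal sketch (not from the commit): greedy url grouping by edit distance,
# plus the per-group values that feed the new "longest" / "most frequent" columns.
import Levenshtein

dist_threshold = 5  # assumed threshold; the script defines its own value


def take_similar(reference, urls):
    # All urls within the edit-distance threshold of the reference url (assumed helper)
    return [u for u in urls if Levenshtein.distance(reference, u) <= dist_threshold]


all_urls = [
    "https://example.org/page",
    "https://example.org/page/",
    "https://other.example.net/start",
]

# Same greedy grouping loop as in the diff above
url_groups = []
while len(all_urls) > 0:
    group = take_similar(all_urls[0], all_urls)
    url_groups.append(group)
    for url in group:
        all_urls.remove(url)

# Per-group metrics matching the appended "longest" and "most frequent" columns
for idx, grp in enumerate(url_groups):
    print(idx, max(grp, key=len), max(set(grp), key=grp.count))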