diff options
Diffstat (limited to 'bjoern/videoanalyse')
-rw-r--r-- | bjoern/videoanalyse/post_processing.py | 41 |
1 files changed, 17 insertions, 24 deletions
diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py index a1baa5a..428cb5d 100644 --- a/bjoern/videoanalyse/post_processing.py +++ b/bjoern/videoanalyse/post_processing.py @@ -79,30 +79,23 @@ urls = list(df["url"].values) url_groups = [] while len(all_urls) > 0: group = take_similar(all_urls[0], all_urls) - url_groups.append([set(group), 0]) + url_groups.append(group) for url in group: all_urls.remove(url) -# Iterate over result-elements pairwise, removing elements under distance threshold -# and always cumulating time of url-groups -new_urls = [] -cum_times = [] -for pair in pairwise(urls): - print(pair) - dist = Levenshtein.distance(pair[0], pair[1]) - if dist > dist_threshold: - new_urls.append(pair[1]) - - -with open(data_path / "grouping_post.csv", "w") as csvfile: - writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC) - writer.writerow(["url"]) - for line in new_urls: - writer.writerow(line) - -with open(data_path / "all_urls.txt", "w") as f: - for group in url_groups: - f.write("=== new group, cumulative_time: {}\n".format(group[1])) - for url in group[0]: - f.write(url) - f.write("\n") +# for every row check which group its url belongs to and add a column with group indices +# also add columns with longest/most frequent url in group +with open (data_path / "metrics.csv", "r") as input_file, \ + open(data_path / "metrics_grps.csv", "w", newline='') as output_file: + csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC) + csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC) + header = next(csv_reader) + header.extend(["group_index","longest","most frequent"]) + csv_writer.writerow(header) + for row in csv_reader: + for idx, grp in enumerate(url_groups): + if row[3] in grp: + row.append(idx) + row.append(max(grp, key=len)) + row.append(max(set(grp), key=grp.count)) + csv_writer.writerow(row) |