diff options
Diffstat (limited to 'bjoern/videoanalyse')
| -rw-r--r-- | bjoern/videoanalyse/post_processing.py | 41 | 
1 files changed, 17 insertions, 24 deletions
| diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py index a1baa5a..428cb5d 100644 --- a/bjoern/videoanalyse/post_processing.py +++ b/bjoern/videoanalyse/post_processing.py @@ -79,30 +79,23 @@ urls = list(df["url"].values)  url_groups = []  while len(all_urls) > 0:      group = take_similar(all_urls[0], all_urls) -    url_groups.append([set(group), 0]) +    url_groups.append(group)      for url in group:          all_urls.remove(url) -# Iterate over result-elements pairwise, removing elements under distance threshold -# and always cumulating time of url-groups -new_urls = [] -cum_times = [] -for pair in pairwise(urls): -    print(pair) -    dist = Levenshtein.distance(pair[0], pair[1]) -    if dist > dist_threshold: -        new_urls.append(pair[1]) - - -with open(data_path / "grouping_post.csv", "w") as csvfile: -    writer = csv.writer(csvfile, quotechar='"', quoting=csv.QUOTE_NONNUMERIC) -    writer.writerow(["url"]) -    for line in new_urls: -        writer.writerow(line) - -with open(data_path / "all_urls.txt", "w") as f: -    for group in url_groups: -        f.write("=== new group, cumulative_time: {}\n".format(group[1])) -        for url in group[0]: -            f.write(url) -            f.write("\n") +# for every row check which group its url belongs to and add a column with group indices +# also add columns with longest/most frequent url in group +with open (data_path / "metrics.csv", "r") as input_file, \ +    open(data_path / "metrics_grps.csv", "w", newline='') as output_file: +    csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC) +    csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC) +    header = next(csv_reader) +    header.extend(["group_index","longest","most frequent"]) +    csv_writer.writerow(header) +    for row in csv_reader: +        for idx, grp in enumerate(url_groups): +            if row[3] in grp: +                row.append(idx) +                row.append(max(grp, key=len)) +                row.append(max(set(grp), key=grp.count)) +        csv_writer.writerow(row) | 
