From ddab7c6cc5ba7e785aadb224f294284b0564acd6 Mon Sep 17 00:00:00 2001 From: Niclas Dobbertin Date: Mon, 2 Oct 2023 19:11:24 +0200 Subject: refactor post processing into single script --- bjoern/videoanalyse/post_processing.py | 103 +++++---------------------------- 1 file changed, 13 insertions(+), 90 deletions(-) (limited to 'bjoern/videoanalyse/post_processing.py') diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py index 94ce067..a8d37c4 100644 --- a/bjoern/videoanalyse/post_processing.py +++ b/bjoern/videoanalyse/post_processing.py @@ -2,105 +2,28 @@ import argparse from pathlib import Path -import numpy as np -import pandas as pd -import Levenshtein -import csv -from itertools import groupby -from operator import itemgetter -from sklearn.metrics import pairwise_distances -from sklearn.cluster import DBSCAN from pprint import pprint -argparser = argparse.ArgumentParser(description="Distance evaluation") -argparser.add_argument("vp_dir", help="Directory containing metrics.csv") +import utils + +argparser = argparse.ArgumentParser(description="OCR-Logfile evaluation") +argparser.add_argument("vp_dir", help="VP Directory") args = argparser.parse_args() data_path = Path(args.vp_dir) +video_path = next(data_path.glob("*.mkv")) +ocr_path = data_path / "analysis_results.csv" +log_path = data_path / f"{data_path.stem}.csv" -# Read results.csv -# with open(data_path / "metrics.csv", "r") as csvfile: -# reader = csv.reader(csvfile, quotechar='"') -# print(next(reader)) -# - - -df = pd.read_csv(data_path / "metrics.csv") -df = df.fillna("") - - -# List with only urls -all_urls = list(df["url"].values) -urls = list(df["url"].values) - - -def group_urls(urls): - unique_urls = np.unique(urls) - - # TODO: casting deprecation np - def levenshtein_from_idx(idx1, idx2): - return Levenshtein.distance(unique_urls[int(idx1)], unique_urls[int(idx2)]) - - X = np.searchsorted(unique_urls, list([[x] for x in urls])) - - distance_matrix = pairwise_distances( - X=X, Y=None, metric=levenshtein_from_idx, n_jobs=-1 - ) - # TODO: eps and min_samples parameter - db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix) - labels = db.labels_ - zipped = zip(urls, labels) - - # grouping solution from: https://www.geeksforgeeks.org/python-group-tuple-into-list-based-on-value/ - # create an empty dictionary to store the grouped tuples - grouped_dict = {} - - # loop through the tuples in the list - for tup in zipped: - # get the second element of the tuple - key = tup[1] - # if the key is not already in the dictionary, add it with an empty list as value - if key not in grouped_dict: - grouped_dict[key] = [] - # append the current tuple to the list corresponding to the key in the dictionary - grouped_dict[key].append(tup[0]) - - # convert the dictionary values to lists and store in res - url_groups = [v for _, v in grouped_dict.items()] +df = utils.combine_ocr_logs(video_path, ocr_path, log_path) +df = df.fillna('') - return url_groups +df = utils.calc_levenshtein_distance(df) -url_groups = group_urls(urls) +url_groups = utils.group_urls(list(df["url"].values)) pprint(len(url_groups)) -# # for every row check which group its url belongs to and add a column with group indices -# # also add columns with longest/most frequent url in group -with open(data_path / "metrics.csv", "r") as input_file, open( - data_path / "metrics_grps.csv", "w", newline="" -) as output_file: - csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC) - csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC) - header = next(csv_reader) - header.extend( - [ - "group_index", - "longest", - "longest-distance", - "most_frequent", - "most_frequent-distance", - ] - ) - csv_writer.writerow(header) - for row in csv_reader: - for idx, grp in enumerate(url_groups): - if row[3] in grp: - row.append(idx) - longest_in_grp = max(grp, key=len) - row.append(longest_in_grp) - row.append(Levenshtein.distance(row[6], longest_in_grp)) - most_frequent_in_grp = max(set(grp), key=grp.count) - row.append(most_frequent_in_grp) - row.append(Levenshtein.distance(row[6], most_frequent_in_grp)) - csv_writer.writerow(row) +df.to_csv(f"{data_path}/metrics.csv") +utils.write_grouped_metrics(df, url_groups, data_path) -- cgit v1.2.3