summaryrefslogtreecommitdiff
path: root/bjoern/videoanalyse/post_processing.py
diff options
context:
space:
mode:
authorNiclas Dobbertin <niclas.dobbertin@stud.tu-darmstadt.de>2023-10-02 19:11:24 +0200
committerNiclas Dobbertin <niclas.dobbertin@stud.tu-darmstadt.de>2023-10-02 19:11:24 +0200
commitddab7c6cc5ba7e785aadb224f294284b0564acd6 (patch)
tree11e2eab853ee35a9d621c5d350cde9cfc9f74393 /bjoern/videoanalyse/post_processing.py
parent7266e2787bfa661d490be9c4463e707e6ffe1715 (diff)
refactor post processing into single script
Diffstat (limited to 'bjoern/videoanalyse/post_processing.py')
-rw-r--r--bjoern/videoanalyse/post_processing.py103
1 file changed, 13 insertions, 90 deletions
diff --git a/bjoern/videoanalyse/post_processing.py b/bjoern/videoanalyse/post_processing.py
index 94ce067..a8d37c4 100644
--- a/bjoern/videoanalyse/post_processing.py
+++ b/bjoern/videoanalyse/post_processing.py
@@ -2,105 +2,28 @@
import argparse
from pathlib import Path
-import numpy as np
-import pandas as pd
-import Levenshtein
-import csv
-from itertools import groupby
-from operator import itemgetter
-from sklearn.metrics import pairwise_distances
-from sklearn.cluster import DBSCAN
from pprint import pprint
-argparser = argparse.ArgumentParser(description="Distance evaluation")
-argparser.add_argument("vp_dir", help="Directory containing metrics.csv")
+import utils
+
+argparser = argparse.ArgumentParser(description="OCR-Logfile evaluation")
+argparser.add_argument("vp_dir", help="VP Directory")
args = argparser.parse_args()
data_path = Path(args.vp_dir)
+video_path = next(data_path.glob("*.mkv"))
+ocr_path = data_path / "analysis_results.csv"
+log_path = data_path / f"{data_path.stem}.csv"
-# Read results.csv
-# with open(data_path / "metrics.csv", "r") as csvfile:
-# reader = csv.reader(csvfile, quotechar='"')
-# print(next(reader))
-#
-
-
-df = pd.read_csv(data_path / "metrics.csv")
-df = df.fillna("")
-
-
-# List with only urls
-all_urls = list(df["url"].values)
-urls = list(df["url"].values)
-
-
-def group_urls(urls):
- unique_urls = np.unique(urls)
-
- # TODO: casting deprecation np
- def levenshtein_from_idx(idx1, idx2):
- return Levenshtein.distance(unique_urls[int(idx1)], unique_urls[int(idx2)])
-
- X = np.searchsorted(unique_urls, list([[x] for x in urls]))
-
- distance_matrix = pairwise_distances(
- X=X, Y=None, metric=levenshtein_from_idx, n_jobs=-1
- )
- # TODO: eps and min_samples parameter
- db = DBSCAN(eps=10, min_samples=5, metric="precomputed").fit(distance_matrix)
- labels = db.labels_
- zipped = zip(urls, labels)
-
- # grouping solution from: https://www.geeksforgeeks.org/python-group-tuple-into-list-based-on-value/
- # create an empty dictionary to store the grouped tuples
- grouped_dict = {}
-
- # loop through the tuples in the list
- for tup in zipped:
- # get the second element of the tuple
- key = tup[1]
- # if the key is not already in the dictionary, add it with an empty list as value
- if key not in grouped_dict:
- grouped_dict[key] = []
- # append the current tuple to the list corresponding to the key in the dictionary
- grouped_dict[key].append(tup[0])
-
- # convert the dictionary values to lists and store in res
- url_groups = [v for _, v in grouped_dict.items()]
+df = utils.combine_ocr_logs(video_path, ocr_path, log_path)
+df = df.fillna('')
- return url_groups
+df = utils.calc_levenshtein_distance(df)
-url_groups = group_urls(urls)
+url_groups = utils.group_urls(list(df["url"].values))
pprint(len(url_groups))
-# # for every row check which group its url belongs to and add a column with group indices
-# # also add columns with longest/most frequent url in group
-with open(data_path / "metrics.csv", "r") as input_file, open(
- data_path / "metrics_grps.csv", "w", newline=""
-) as output_file:
- csv_reader = csv.reader(input_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
- csv_writer = csv.writer(output_file, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
- header = next(csv_reader)
- header.extend(
- [
- "group_index",
- "longest",
- "longest-distance",
- "most_frequent",
- "most_frequent-distance",
- ]
- )
- csv_writer.writerow(header)
- for row in csv_reader:
- for idx, grp in enumerate(url_groups):
- if row[3] in grp:
- row.append(idx)
- longest_in_grp = max(grp, key=len)
- row.append(longest_in_grp)
- row.append(Levenshtein.distance(row[6], longest_in_grp))
- most_frequent_in_grp = max(set(grp), key=grp.count)
- row.append(most_frequent_in_grp)
- row.append(Levenshtein.distance(row[6], most_frequent_in_grp))
- csv_writer.writerow(row)
+df.to_csv(f"{data_path}/metrics.csv")
+utils.write_grouped_metrics(df, url_groups, data_path)