diff options
Diffstat (limited to 'bjoern/videoanalyse/LogParser.py')
-rwxr-xr-x | bjoern/videoanalyse/LogParser.py | 189 |
1 files changed, 189 insertions, 0 deletions
diff --git a/bjoern/videoanalyse/LogParser.py b/bjoern/videoanalyse/LogParser.py new file mode 100755 index 0000000..c444867 --- /dev/null +++ b/bjoern/videoanalyse/LogParser.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*-
+"""
+This script takes multiple .log data files from PCScreenWatcher and a Firefox history database (places.sqlite)
+inside a folder to extract each activity title, their url if applicable and usage timestamps into a pseudo-csv file.
+"""
+
+import os
+import sqlite3
+import re
+import datetime
+from io import StringIO
+import pandas as pd
+import csv
+import codecs
+
+
+
+#takes the log data string and returns a list of activity titles and their time windows
+def extract_activities(log_data):
+ #regex which matches squared brackets
+ titles = re.compile("\[.*?\]")
+ #regex for total/active time
+ time = re.compile("(?<!\()Total: \d*\.*\d*m*\d*\.*\d*s*, Active: \d*\.*\d*m*\d*\.*\d*s*")
+ #regex which matches number + . and splits into different strings
+ split = re.compile("\d+\. ")
+ windows = split.split(log_data)
+ brackets = []
+ #Flag to delete next String if current endswith 'Restricted Windows Summary:'
+ #(only row with a number that needs deleting)
+ del_flag = False
+ #extract squared brackets per string
+ for index, s in enumerate(windows):
+ if del_flag:
+ s = ""
+ del_flag = False
+ if s.endswith("Restricted Windows Summary:\n"):
+ del_flag = True
+ brackets.append(titles.findall(s))
+ if (brackets[index]) != [] and time.findall(s) != []:
+ brackets[index].insert(5, time.findall(s)[0])
+ #remove empty lists
+ brackets = [x for x in brackets if x != []]
+ for index_a, bracket in enumerate(brackets):
+ if bracket != []:
+ #remove superfluous brackets like rule data
+ brackets[index_a] = bracket[:1] + bracket[5:]
+ for index_b, string in enumerate(brackets[index_a]):
+ #remove squared brackets in each string
+ if type(string) is str and string.startswith("[") and string.endswith("]"):
+ brackets[index_a][index_b] = string[1:-1]
+ #remove Firefox suffix from title to match with history
+ if type((brackets[index_a])[index_b]) is str and (brackets[index_a])[index_b].endswith(" - Mozilla Firefox"):
+ brackets[index_a][index_b] = brackets[index_a][index_b][:-18]
+ #add string delimiters
+ brackets[index_a][index_b] = "\"" + brackets[index_a][index_b] + "\""
+
+ #print(brackets[index_a])
+ #print(brackets)
+ return brackets
+
+
+
+#returns logged activities and their timestamps from a VP as a table
+def get_log_data(data_path):
+ #import browser history and .log files
+ files = os.listdir(data_path)
+ log_files = []
+ for s in files:
+ if s.endswith(".log"):
+ log_files.append(os.path.join(data_path, s))
+
+ #import log data
+ log_data = ""
+ for l in log_files:
+ with codecs.open(l, 'r', 'utf-8') as reader:
+ log_data += reader.read()
+ return log_data
+
+def get_history_db(data_path):
+ files = os.listdir(data_path)
+ for s in files:
+ if s.endswith(".sqlite"):
+ history_db = os.path.join(data_path, s)
+ return history_db
+ #extract browsing history data
+
+#Open browsing history database
+#c = sqlite3.connect(history_db)
+# cursor = c.cursor()
+
+# def show_history():
+# select_statement = "select url from moz_places;"
+# cursor.execute(select_statement)
+# results = cursor.fetchall()
+# for url in results:
+# print(url)
+
+def split_lines(string): return iter(string.splitlines())
+
+#TODO: needs complete rethinking
+def aggregate_titles(data_string):
+ result_table = []
+ titles = []
+ for row in data_string.splitlines():
+ fields = row.split(",")
+ print(fields[0])
+ if fields[0] in titles:
+ #append
+ print("APPEND!!!!!!")
+ pos = titles.index(fields[0])
+ result_fields = result_table[pos].split(",")
+ result_fields[1] += ("|" + fields[1])
+ result_fields[-1] = result_fields[-1][-1]
+ for timeslot in fields[2:]:
+ result_fields.append(timeslot)
+ tmp = ""
+ for value in result_fields:
+ tmp += (value + ",")
+ result_table[pos] = tmp[:-1]
+ else:
+ titles.append(row)
+ result_table.append(row)
+ #print(result)
+ result = ""
+ for e in result_table:
+ result += e
+ result += "\n"
+ return result
+
+
+def match_urls(history_db, df):
+ c = sqlite3.connect(history_db)
+ cursor = c.cursor()
+ select_statement = "select url from moz_places where title = ?;"
+ vl_list = df["title"].values
+ #print(vl_list)
+ index = 0
+ for index, name in enumerate(vl_list):
+ cursor.execute(select_statement, (name,))
+ results = cursor.fetchall()
+ print(results)
+ if results != []:
+ df.iloc[index, 1] = results[0]
+
+
+choice = input("Press 'l' and enter to extract log data. Make sure this script"
+ +"file is in the folder containing the VP-subfolders.")
+if choice == "l":
+ for dir in [f.name for f in os.scandir() if f.is_dir()]:
+ print(dir)
+ log = extract_activities(get_log_data(dir))
+ data = ""
+ for item in log:
+ for s in item:
+ data += "%s," % s
+ data = data[:-1]
+ data += "\n"
+ #agg_data = aggregate_titles(data)
+ col_count = 1
+ for line in split_lines(data):
+ commas = line.count(",") + 1
+ if commas > col_count:
+ col_count = commas
+ #print(col_count)
+ column_names = [i for i in range(0, col_count-1)]
+ #table = StringIO(agg_data)
+ table = StringIO(data)
+ df = pd.read_csv(table, header=None, sep=',', quotechar='"', names=column_names,quoting=csv.QUOTE_ALL)
+ df.insert(1, "url", "")
+ df.rename(columns = {0:'title'}, inplace = True)
+ df.rename(columns = {1:'total-active'}, inplace = True)
+
+ df.to_csv('%s/%s.csv' % (dir, dir), sep=';', quoting=csv.QUOTE_ALL)
+
+ match_urls(get_history_db(dir), df)
+
+ #somehow parse total/active time for aggregation - no idea right now
+ # df.insert(2, "active_time", 0)
+ # for index, row in df.iterrows():
+ # total_string = row[1][8:13]
+ # print(total_string)
+ # #df.at[i, 2] = active_time
+ # #df.at[i, 1] = total_time
+
+ df.to_csv('%s/%s.csv' % (dir, dir), sep=';', quoting=csv.QUOTE_ALL)
+
+
+
+input("*Press enter to close*")
|