1 files changed, 189 insertions, 0 deletions
diff --git a/bjoern/videoanalyse/LogParser.py b/bjoern/videoanalyse/LogParser.py
new file mode 100755
index 0000000..c444867
--- /dev/null
+++ b/bjoern/videoanalyse/LogParser.py
@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+"""
+This script takes multiple .log data files from PCScreenWatcher and a Firefox history database (places.sqlite)
+inside a folder to extract each activity title, their url if applicable and usage timestamps into a pseudo-csv file.
+"""
+
+import os
+import sqlite3
+import re
+import datetime
+from io import StringIO
+import pandas as pd
+import csv
+import codecs
+
+
+
+#takes the log data string and returns a list of activity titles and their time windows
+def extract_activities(log_data):
+    #regex which matches squared brackets
+    titles = re.compile("\[.*?\]")
+    #regex for total/active time
+    time = re.compile("(?<!\()Total: \d*\.*\d*m*\d*\.*\d*s*, Active: \d*\.*\d*m*\d*\.*\d*s*")
+    #regex which matches number + . and splits into different strings
+    split = re.compile("\d+\. ")
+    windows = split.split(log_data)
+    brackets = []
+    #Flag to delete next String if current endswith 'Restricted Windows Summary:' 
+    #(only row with a number that needs deleting)
+    del_flag = False
+    #extract squared brackets per string
+    for index, s in enumerate(windows):
+        if del_flag:
+            s = ""
+            del_flag = False
+        if s.endswith("Restricted Windows Summary:\n"):
+            del_flag = True
+        brackets.append(titles.findall(s))
+        if (brackets[index]) != [] and time.findall(s) != []:
+            brackets[index].insert(5, time.findall(s)[0])
+    #remove empty lists
+    brackets = [x for x in brackets if x != []]
+    for index_a, bracket in enumerate(brackets):
+        if bracket != []:
+            #remove superfluous brackets like rule data
+            brackets[index_a] = bracket[:1] + bracket[5:]
+            for index_b, string in enumerate(brackets[index_a]):
+                #remove squared brackets in each string
+                if type(string) is str and string.startswith("[") and string.endswith("]"):
+                    brackets[index_a][index_b] = string[1:-1]
+                #remove Firefox suffix from title to match with history
+                if type((brackets[index_a])[index_b]) is str and (brackets[index_a])[index_b].endswith(" - Mozilla Firefox"):
+                    brackets[index_a][index_b] = brackets[index_a][index_b][:-18]
+                #add string delimiters
+                brackets[index_a][index_b] = "\"" + brackets[index_a][index_b] + "\""
+
+            #print(brackets[index_a])
+    #print(brackets)
+    return brackets
+
+
+
+#returns logged activities and their timestamps from a VP as a table
+def get_log_data(data_path):
+    #import browser history and .log files
+    files = os.listdir(data_path)
+    log_files = []
+    for s in files:
+        if s.endswith(".log"):
+            log_files.append(os.path.join(data_path, s))
+        
+    #import log data
+            log_data = ""
+    for l in log_files:
+        with codecs.open(l, 'r', 'utf-8') as reader:
+            log_data += reader.read()
+    return log_data
+          
+def get_history_db(data_path):
+    files = os.listdir(data_path)
+    for s in files:
+        if s.endswith(".sqlite"):
+            history_db = os.path.join(data_path, s)
+    return history_db
+    #extract browsing history data
+
+#Open browsing history database
+#c = sqlite3.connect(history_db)
+# cursor = c.cursor()
+
+# def show_history():
+#     select_statement = "select url from moz_places;"
+#     cursor.execute(select_statement)
+#     results = cursor.fetchall()
+#     for url in results:
+#         print(url)
+        
+def split_lines(string): return iter(string.splitlines())
+
+#TODO: needs complete rethinking
+def aggregate_titles(data_string):
+    result_table = []
+    titles = []
+    for row in data_string.splitlines():
+        fields = row.split(",")
+        print(fields[0])
+        if fields[0] in titles:
+            #append
+            print("APPEND!!!!!!")
+            pos = titles.index(fields[0])
+            result_fields = result_table[pos].split(",")
+            result_fields[1] += ("|" + fields[1])
+            result_fields[-1] = result_fields[-1][-1]
+            for timeslot in fields[2:]:
+                result_fields.append(timeslot)
+            tmp = ""
+            for value in result_fields:
+                tmp += (value + ",")
+            result_table[pos] = tmp[:-1]
+        else:
+            titles.append(row)
+            result_table.append(row)
+    #print(result)
+    result = ""
+    for e in result_table:
+        result += e
+        result += "\n"
+    return result
+    
+                
+def match_urls(history_db, df):
+    c = sqlite3.connect(history_db)
+    cursor = c.cursor()
+    select_statement = "select url from moz_places where title = ?;"
+    vl_list = df["title"].values
+    #print(vl_list)
+    index = 0
+    for index, name in enumerate(vl_list):
+        cursor.execute(select_statement, (name,))
+        results = cursor.fetchall()
+        print(results)
+        if results != []: 
+            df.iloc[index, 1] = results[0]
+       
+
+choice = input("Press 'l' and enter to extract log data. Make sure this script"
+               +"file is in the folder containing the VP-subfolders.")
+if choice == "l":
+    for dir in [f.name for f in os.scandir() if f.is_dir()]:
+        print(dir)
+        log = extract_activities(get_log_data(dir))
+        data = ""
+        for item in log:
+            for s in item:
+                data += "%s," % s
+            data = data[:-1]
+            data += "\n"
+        #agg_data = aggregate_titles(data)
+        col_count = 1
+        for line in split_lines(data):
+            commas = line.count(",") + 1
+            if commas > col_count:
+                col_count = commas
+        #print(col_count)
+        column_names = [i for i in range(0, col_count-1)]
+        #table = StringIO(agg_data)
+        table = StringIO(data)
+        df = pd.read_csv(table, header=None, sep=',', quotechar='"', names=column_names,quoting=csv.QUOTE_ALL)
+        df.insert(1, "url", "")
+        df.rename(columns = {0:'title'}, inplace = True)
+        df.rename(columns = {1:'total-active'}, inplace = True)
+
+        df.to_csv('%s/%s.csv' % (dir, dir), sep=';', quoting=csv.QUOTE_ALL)
+        
+        match_urls(get_history_db(dir), df)
+        
+        #somehow parse total/active time for aggregation - no idea right now
+        # df.insert(2, "active_time", 0)
+        # for index, row in df.iterrows():
+        #     total_string = row[1][8:13]
+        #     print(total_string)
+        #     #df.at[i, 2] = active_time
+        #     #df.at[i, 1] = total_time
+        
+        df.to_csv('%s/%s.csv' % (dir, dir), sep=';', quoting=csv.QUOTE_ALL)
+
+    
+    
+input("*Press enter to close*")