bjoern/videoanalyse/LogParser.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189

# -*- coding: utf-8 -*-
"""
This script takes multiple .log data files from PCScreenWatcher and a Firefox history database (places.sqlite)
inside a folder to extract each activity title, their url if applicable and usage timestamps into a pseudo-csv file.
"""

import os
import sqlite3
import re
import datetime
from io import StringIO
import pandas as pd
import csv
import codecs


#takes the log data string and returns a list of activity titles and their time windows
def extract_activities(log_data):
    #regex which matches squared brackets
    titles = re.compile("\[.*?\]")
    #regex for total/active time
    time = re.compile("(?<!\()Total: \d*\.*\d*m*\d*\.*\d*s*, Active: \d*\.*\d*m*\d*\.*\d*s*")
    #regex which matches number + . and splits into different strings
    split = re.compile("\d+\. ")
    windows = split.split(log_data)
    brackets = []
    #Flag to delete next String if current endswith 'Restricted Windows Summary:' 
    #(only row with a number that needs deleting)
    del_flag = False
    #extract squared brackets per string
    for index, s in enumerate(windows):
        if del_flag:
            s = ""
            del_flag = False
        if s.endswith("Restricted Windows Summary:\n"):
            del_flag = True
        brackets.append(titles.findall(s))
        if (brackets[index]) != [] and time.findall(s) != []:
            brackets[index].insert(5, time.findall(s)[0])
    #remove empty lists
    brackets = [x for x in brackets if x != []]
    for index_a, bracket in enumerate(brackets):
        if bracket != []:
            #remove superfluous brackets like rule data
            brackets[index_a] = bracket[:1] + bracket[5:]
            for index_b, string in enumerate(brackets[index_a]):
                #remove squared brackets in each string
                if type(string) is str and string.startswith("[") and string.endswith("]"):
                    brackets[index_a][index_b] = string[1:-1]
                #remove Firefox suffix from title to match with history
                if type((brackets[index_a])[index_b]) is str and (brackets[index_a])[index_b].endswith(" - Mozilla Firefox"):
                    brackets[index_a][index_b] = brackets[index_a][index_b][:-18]
                #add string delimiters
                brackets[index_a][index_b] = "\"" + brackets[index_a][index_b] + "\""

            #print(brackets[index_a])
    #print(brackets)
    return brackets


#returns logged activities and their timestamps from a VP as a table
def get_log_data(data_path):
    #import browser history and .log files
    files = os.listdir(data_path)
    log_files = []
    for s in files:
        if s.endswith(".log"):
            log_files.append(os.path.join(data_path, s))
        
    #import log data
            log_data = ""
    for l in log_files:
        with codecs.open(l, 'r', 'utf-8') as reader:
            log_data += reader.read()
    return log_data
          
def get_history_db(data_path):
    files = os.listdir(data_path)
    for s in files:
        if s.endswith(".sqlite"):
            history_db = os.path.join(data_path, s)
    return history_db
    #extract browsing history data

#Open browsing history database
#c = sqlite3.connect(history_db)
# cursor = c.cursor()

# def show_history():
#     select_statement = "select url from moz_places;"
#     cursor.execute(select_statement)
#     results = cursor.fetchall()
#     for url in results:
#         print(url)
        
def split_lines(string): return iter(string.splitlines())

#TODO: needs complete rethinking
def aggregate_titles(data_string):
    result_table = []
    titles = []
    for row in data_string.splitlines():
        fields = row.split(",")
        print(fields[0])
        if fields[0] in titles:
            #append
            print("APPEND!!!!!!")
            pos = titles.index(fields[0])
            result_fields = result_table[pos].split(",")
            result_fields[1] += ("|" + fields[1])
            result_fields[-1] = result_fields[-1][-1]
            for timeslot in fields[2:]:
                result_fields.append(timeslot)
            tmp = ""
            for value in result_fields:
                tmp += (value + ",")
            result_table[pos] = tmp[:-1]
        else:
            titles.append(row)
            result_table.append(row)
    #print(result)
    result = ""
    for e in result_table:
        result += e
        result += "\n"
    return result
    
                
def match_urls(history_db, df):
    c = sqlite3.connect(history_db)
    cursor = c.cursor()
    select_statement = "select url from moz_places where title = ?;"
    vl_list = df["title"].values
    #print(vl_list)
    index = 0
    for index, name in enumerate(vl_list):
        cursor.execute(select_statement, (name,))
        results = cursor.fetchall()
        print(results)
        if results != []: 
            df.iloc[index, 1] = results[0]
       

choice = input("Press 'l' and enter to extract log data. Make sure this script"
               +"file is in the folder containing the VP-subfolders.")
if choice == "l":
    for dir in [f.name for f in os.scandir() if f.is_dir()]:
        print(dir)
        log = extract_activities(get_log_data(dir))
        data = ""
        for item in log:
            for s in item:
                data += "%s," % s
            data = data[:-1]
            data += "\n"
        #agg_data = aggregate_titles(data)
        col_count = 1
        for line in split_lines(data):
            commas = line.count(",") + 1
            if commas > col_count:
                col_count = commas
        #print(col_count)
        column_names = [i for i in range(0, col_count-1)]
        #table = StringIO(agg_data)
        table = StringIO(data)
        df = pd.read_csv(table, header=None, sep=',', quotechar='"', names=column_names,quoting=csv.QUOTE_ALL)
        df.insert(1, "url", "")
        df.rename(columns = {0:'title'}, inplace = True)
        df.rename(columns = {1:'total-active'}, inplace = True)

        df.to_csv('%s/%s.csv' % (dir, dir), sep=';', quoting=csv.QUOTE_ALL)
        
        match_urls(get_history_db(dir), df)
        
        #somehow parse total/active time for aggregation - no idea right now
        # df.insert(2, "active_time", 0)
        # for index, row in df.iterrows():
        #     total_string = row[1][8:13]
        #     print(total_string)
        #     #df.at[i, 2] = active_time
        #     #df.at[i, 1] = total_time
        
        df.to_csv('%s/%s.csv' % (dir, dir), sep=';', quoting=csv.QUOTE_ALL)

    
input("*Press enter to close*")