From 556dd4a49bb336eb309287d291ad36f0df90b01b Mon Sep 17 00:00:00 2001
From: Niclas Dobbertin
Date: Thu, 15 Jun 2023 16:06:08 +0200
Subject: new log parser returning start/end time, title, url

---
 bjoern/videoanalyse/LogParser.py | 275 ++++++++++++++++-----------------------
 1 file changed, 114 insertions(+), 161 deletions(-)

diff --git a/bjoern/videoanalyse/LogParser.py b/bjoern/videoanalyse/LogParser.py
index c444867..c580cfe 100755
--- a/bjoern/videoanalyse/LogParser.py
+++ b/bjoern/videoanalyse/LogParser.py
@@ -5,185 +5,138 @@ inside a folder to extract each activity title, their url if applicable and usag
 """
 
 import os
-import sqlite3
 import re
-import datetime
-from io import StringIO
-import pandas as pd
+from datetime import datetime
 import csv
 import codecs
+from read_sqlite import get_url_from_sqlite
+from pathlib import Path
 
-
-#takes the log data string and returns a list of activity titles and their time windows
+# takes the log data string and returns a list of activity titles and their time windows
 def extract_activities(log_data):
-    #regex which matches squared brackets
-    titles = re.compile("\[.*?\]")
-    #regex for total/active time
-    time = re.compile("(? col_count:
-            col_count = commas
-    #print(col_count)
-    column_names = [i for i in range(0, col_count-1)]
-    #table = StringIO(agg_data)
-    table = StringIO(data)
-    df = pd.read_csv(table, header=None, sep=',', quotechar='"', names=column_names,quoting=csv.QUOTE_ALL)
-    df.insert(1, "url", "")
-    df.rename(columns = {0:'title'}, inplace = True)
-    df.rename(columns = {1:'total-active'}, inplace = True)
-
-    df.to_csv('%s/%s.csv' % (dir, dir), sep=';', quoting=csv.QUOTE_ALL)
-
-    match_urls(get_history_db(dir), df)
-
-    #somehow parse total/active time for aggregation - no idea right now
-    # df.insert(2, "active_time", 0)
-    # for index, row in df.iterrows():
-    #     total_string = row[1][8:13]
-    #     print(total_string)
-    #     #df.at[i, 2] = active_time
-    #     #df.at[i, 1] = total_time
-
-    df.to_csv('%s/%s.csv' % (dir, dir), sep=';', quoting=csv.QUOTE_ALL)
-
-
-
+for vp_dir in [f.name for f in os.scandir() if f.is_dir()]:
+    print(vp_dir)
+    log = extract_activities(get_log_data(vp_dir))
+    log = generate_log(log)
+
+    history = get_history_db(vp_dir)
+    log = match_urls(history, log)
+
+    path = Path(f"{vp_dir}/{vp_dir}.csv")
+    with open(path, "w") as csvfile:
+        writer = csv.writer(csvfile, delimiter=",", quoting=csv.QUOTE_NONNUMERIC)
+        writer.writerow(["Starttime", "Endtime", "Title", "URL"])
+        for row in log:
+            writer.writerow(row)
+
+input("*Press enter to close*")
--
cgit v1.2.3
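
Not part of the upstream patch: a minimal sketch of how the per-directory CSV written by the new main loop above could be read back for later analysis. The directory name "vp01" is a hypothetical placeholder; the real names are discovered at runtime with os.scandir().

import csv
from pathlib import Path

# "vp01" is a hypothetical participant directory; LogParser.py discovers the
# real ones with os.scandir() and writes one CSV per directory.
vp_dir = "vp01"
path = Path(f"{vp_dir}/{vp_dir}.csv")

with open(path, newline="") as csvfile:
    reader = csv.reader(csvfile, delimiter=",", quoting=csv.QUOTE_NONNUMERIC)
    header = next(reader)  # ["Starttime", "Endtime", "Title", "URL"]
    for starttime, endtime, title, url in reader:
        print(starttime, endtime, title, url)

Reading with csv.QUOTE_NONNUMERIC mirrors the writer settings in the patch; because every field is written as a quoted string, nothing is coerced to float on the way back in.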