author    Niclas Dobbertin <niclas.dobbertin@stud.tu-darmstadt.de>    2023-06-15 16:06:08 +0200
committer Niclas Dobbertin <niclas.dobbertin@stud.tu-darmstadt.de>    2023-06-15 16:06:08 +0200
commit    556dd4a49bb336eb309287d291ad36f0df90b01b (patch)
tree      aed5a35343f4d15d59cab489df42528f6fae234e
parent    b6b5918493260518ee4b868ea03e5316caca40aa (diff)
new log parser returning start/end time, title, url
-rwxr-xr-x bjoern/videoanalyse/LogParser.py | 275
1 file changed, 114 insertions(+), 161 deletions(-)
diff --git a/bjoern/videoanalyse/LogParser.py b/bjoern/videoanalyse/LogParser.py
index c444867..c580cfe 100755
--- a/bjoern/videoanalyse/LogParser.py
+++ b/bjoern/videoanalyse/LogParser.py
@@ -5,185 +5,138 @@ inside a folder to extract each activity title, their url if applicable and usag
"""
import os
-import sqlite3
import re
-import datetime
-from io import StringIO
-import pandas as pd
+from datetime import datetime
import csv
import codecs
+from read_sqlite import get_url_from_sqlite
+from pathlib import Path
-
-#takes the log data string and returns a list of activity titles and their time windows
+# takes a list of log-file contents and returns a dict mapping each activity
+# title to the list of its (enter, exit) time windows
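+# e.g. (hypothetical): {"Example Page": [(time(10, 15, 0), time(10, 16, 30))]}
+# where the tuple entries are datetime.time objects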
def extract_activities(log_data):
- #regex which matches squared brackets
- titles = re.compile("\[.*?\]")
- #regex for total/active time
- time = re.compile("(?<!\()Total: \d*\.*\d*m*\d*\.*\d*s*, Active: \d*\.*\d*m*\d*\.*\d*s*")
- #regex which matches number + . and splits into different strings
- split = re.compile("\d+\. ")
- windows = split.split(log_data)
- brackets = []
- #Flag to delete next String if current endswith 'Restricted Windows Summary:'
- #(only row with a number that needs deleting)
- del_flag = False
- #extract squared brackets per string
- for index, s in enumerate(windows):
- if del_flag:
- s = ""
- del_flag = False
- if s.endswith("Restricted Windows Summary:\n"):
- del_flag = True
- brackets.append(titles.findall(s))
- if (brackets[index]) != [] and time.findall(s) != []:
- brackets[index].insert(5, time.findall(s)[0])
- #remove empty lists
- brackets = [x for x in brackets if x != []]
- for index_a, bracket in enumerate(brackets):
- if bracket != []:
- #remove superfluous brackets like rule data
- brackets[index_a] = bracket[:1] + bracket[5:]
- for index_b, string in enumerate(brackets[index_a]):
- #remove squared brackets in each string
- if type(string) is str and string.startswith("[") and string.endswith("]"):
- brackets[index_a][index_b] = string[1:-1]
- #remove Firefox suffix from title to match with history
- if type((brackets[index_a])[index_b]) is str and (brackets[index_a])[index_b].endswith(" - Mozilla Firefox"):
- brackets[index_a][index_b] = brackets[index_a][index_b][:-18]
- #add string delimiters
- brackets[index_a][index_b] = "\"" + brackets[index_a][index_b] + "\""
-
- #print(brackets[index_a])
- #print(brackets)
- return brackets
-
-
-
-#returns logged activities and their timestamps from a VP as a table
+    # regex capturing the text between square brackets
+    reg_titles = re.compile(r"(?<=\[).*?(?=\])")
+    # regex for the total/active time summary
+    reg_time = re.compile(
+        r"(?<!\()Total: \d*\.*\d*m*\d*\.*\d*s*, Active: \d*\.*\d*m*\d*\.*\d*s*"
+    )
+    # regex matching the "<number>. " prefix, used to split the log into window entries
+    reg_split = re.compile(r"\d+\. ")
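+    # The patterns assume numbered window entries roughly of this shape
+    # (hypothetical sample, not taken from a real log):
+    #   1. [Example Page - Mozilla Firefox] ... Total: 1m30s, Active: 0m45s
+    #      ... Enter-Exit: [10:15:00-10:16:30]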
+
+ extracted_data = {}
+    for log_text in log_data:
+        windows = reg_split.split(log_text)
+ brackets = []
+
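+        # del_flag gates the parser: entries are skipped until the
+        # "Activies in each window" section begins (spelling matched verbatim
+        # below, presumably as written in the log files) and skipped again
+        # after "Restricted Windows Summary:"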
+ del_flag = True
+ # extract squared brackets per string
+ for s in windows:
+ if not del_flag:
+ print("STRING: ", s)
+ found_brackets = reg_titles.findall(s)
+                if found_brackets:
+                    title = found_brackets[0]
+                    # strip the Firefox window suffix so titles match the
+                    # browser history
+                    if title.endswith(" - Mozilla Firefox"):
+                        title = title.replace(" - Mozilla Firefox", "")
+                    brackets.append(title)
+                    enter_exit = s.split("Enter-Exit: ")[-1]
+                    timestamps = reg_titles.findall(enter_exit)
+                    print("ENTER: ", timestamps)
+                    for timestamp in timestamps:
+                        print(timestamp.split("-"))
+                        t_enter, t_exit = timestamp.split("-")
+                        if title not in extracted_data:
+                            extracted_data[title] = []
+                        enter_date = datetime.strptime(t_enter, "%H:%M:%S").time()
+                        exit_date = datetime.strptime(t_exit, "%H:%M:%S").time()
+                        extracted_data[title].append((enter_date, exit_date))
+ if "Activies in each window" in s:
+ del_flag = False
+ if "Restricted Windows Summary:" in s:
+ del_flag = True
+ print("-------------------------FINISH----------------------------------------")
+ print(extracted_data)
+
+ return extracted_data
+
+
+# reads every .log file for a VP and returns their raw contents as a list of
+# strings, one per file
def get_log_data(data_path):
- #import browser history and .log files
+ # import browser history and .log files
files = os.listdir(data_path)
log_files = []
+ log_data = []
for s in files:
if s.endswith(".log"):
log_files.append(os.path.join(data_path, s))
-
- #import log data
- log_data = ""
for l in log_files:
- with codecs.open(l, 'r', 'utf-8') as reader:
- log_data += reader.read()
+ with codecs.open(l, "r", "utf-8") as reader:
+ log_data.append(reader.read())
return log_data
-
+
+
def get_history_db(data_path):
files = os.listdir(data_path)
for s in files:
if s.endswith(".sqlite"):
history_db = os.path.join(data_path, s)
- return history_db
- #extract browsing history data
-
-#Open browsing history database
-#c = sqlite3.connect(history_db)
-# cursor = c.cursor()
-
-# def show_history():
-# select_statement = "select url from moz_places;"
-# cursor.execute(select_statement)
-# results = cursor.fetchall()
-# for url in results:
-# print(url)
-
-def split_lines(string): return iter(string.splitlines())
-
-#TODO: needs complete rethinking
-def aggregate_titles(data_string):
- result_table = []
- titles = []
- for row in data_string.splitlines():
- fields = row.split(",")
- print(fields[0])
- if fields[0] in titles:
- #append
- print("APPEND!!!!!!")
- pos = titles.index(fields[0])
- result_fields = result_table[pos].split(",")
- result_fields[1] += ("|" + fields[1])
- result_fields[-1] = result_fields[-1][-1]
- for timeslot in fields[2:]:
- result_fields.append(timeslot)
- tmp = ""
- for value in result_fields:
- tmp += (value + ",")
- result_table[pos] = tmp[:-1]
- else:
- titles.append(row)
- result_table.append(row)
- #print(result)
- result = ""
- for e in result_table:
- result += e
- result += "\n"
- return result
-
-
-def match_urls(history_db, df):
- c = sqlite3.connect(history_db)
- cursor = c.cursor()
- select_statement = "select url from moz_places where title = ?;"
- vl_list = df["title"].values
- #print(vl_list)
- index = 0
- for index, name in enumerate(vl_list):
- cursor.execute(select_statement, (name,))
- results = cursor.fetchall()
- print(results)
- if results != []:
- df.iloc[index, 1] = results[0]
-
-
-choice = input("Press 'l' and enter to extract log data. Make sure this script"
- +"file is in the folder containing the VP-subfolders.")
+            return history_db
+    return None
+
+
+def match_urls(history_db, log):
+ for entry in log:
+ url = get_url_from_sqlite(history_db, entry[2])
+ entry.append(url)
+ return log
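+
+# read_sqlite.get_url_from_sqlite is not part of this diff; a minimal sketch of
+# what it presumably does, mirroring the moz_places lookup that the old inline
+# match_urls performed (an assumption, not the actual module):
+#
+#   import sqlite3
+#
+#   def get_url_from_sqlite(history_db, title):
+#       with sqlite3.connect(history_db) as conn:
+#           rows = conn.execute(
+#               "select url from moz_places where title = ?;", (title,)
+#           ).fetchall()
+#       return rows[0][0] if rows else ""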
+
+def generate_log(activities: dict):
+ log = []
+ while activities:
+ first_title = list(activities.keys())[0]
+ smallest_start_time = (first_title, 0)
+ for title in activities.keys():
+ for idx, timestamp in enumerate(activities[title]):
+ if (
+ timestamp[0]
+ < activities[smallest_start_time[0]][smallest_start_time[1]][0]
+ ):
+ smallest_start_time = (title, idx)
+ # print(
+ # f"title: {smallest_start_time[0]}, time: {activities[smallest_start_time[0]][smallest_start_time[1]]}"
+ # )
+ log.append(
+ [
+                activities[smallest_start_time[0]][smallest_start_time[1]][0].isoformat(),
+                activities[smallest_start_time[0]][smallest_start_time[1]][1].isoformat(),
+ smallest_start_time[0],
+ ]
+ )
+ del activities[smallest_start_time[0]][smallest_start_time[1]]
+ if not activities[smallest_start_time[0]]:
+ del activities[smallest_start_time[0]]
+    return log
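+
+# Example (hypothetical input): given
+#   {"Inbox": [(time(9, 0), time(9, 5))], "Docs": [(time(8, 55), time(9, 0))]}
+# generate_log repeatedly pops the window with the smallest start time and
+# emits rows in chronological order:
+#   [["08:55:00", "09:00:00", "Docs"], ["09:00:00", "09:05:00", "Inbox"]]
+# (sorting a flattened list of (start, end, title) tuples would give the same
+# result in O(n log n))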
+
+
+choice = input(
+ "Press 'l' and enter to extract log data. Make sure this script"
+ + "file is in the folder containing the VP-subfolders."
+)
if choice == "l":
- for dir in [f.name for f in os.scandir() if f.is_dir()]:
- print(dir)
- log = extract_activities(get_log_data(dir))
- data = ""
- for item in log:
- for s in item:
- data += "%s," % s
- data = data[:-1]
- data += "\n"
- #agg_data = aggregate_titles(data)
- col_count = 1
- for line in split_lines(data):
- commas = line.count(",") + 1
- if commas > col_count:
- col_count = commas
- #print(col_count)
- column_names = [i for i in range(0, col_count-1)]
- #table = StringIO(agg_data)
- table = StringIO(data)
- df = pd.read_csv(table, header=None, sep=',', quotechar='"', names=column_names,quoting=csv.QUOTE_ALL)
- df.insert(1, "url", "")
- df.rename(columns = {0:'title'}, inplace = True)
- df.rename(columns = {1:'total-active'}, inplace = True)
-
- df.to_csv('%s/%s.csv' % (dir, dir), sep=';', quoting=csv.QUOTE_ALL)
-
- match_urls(get_history_db(dir), df)
-
- #somehow parse total/active time for aggregation - no idea right now
- # df.insert(2, "active_time", 0)
- # for index, row in df.iterrows():
- # total_string = row[1][8:13]
- # print(total_string)
- # #df.at[i, 2] = active_time
- # #df.at[i, 1] = total_time
-
- df.to_csv('%s/%s.csv' % (dir, dir), sep=';', quoting=csv.QUOTE_ALL)
-
-
-
+ for vp_dir in [f.name for f in os.scandir() if f.is_dir()]:
+ print(vp_dir)
+ log = extract_activities(get_log_data(vp_dir))
+ log = generate_log(log)
+
+ history = get_history_db(vp_dir)
+ log = match_urls(history, log)
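+        # each log row is now [starttime, endtime, title, url]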
+
+ path = Path(f"{vp_dir}/{vp_dir}.csv")
+        with open(path, "w", newline="") as csvfile:
+ writer = csv.writer(csvfile, delimiter=",", quoting=csv.QUOTE_NONNUMERIC)
+ writer.writerow(["Starttime", "Endtime", "Title", "URL"])
+ for row in log:
+ writer.writerow(row)
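+        # with QUOTE_NONNUMERIC every string field is quoted; the CSV then
+        # looks like (hypothetical values):
+        #   "Starttime","Endtime","Title","URL"
+        #   "09:00:00","09:05:00","Inbox","https://example.org/inbox"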
+
+
input("*Press enter to close*")