author    Niclas Dobbertin <niclas.dobbertin@mailbox.org>  2023-11-01 11:20:45 +0100
committer Niclas Dobbertin <niclas.dobbertin@mailbox.org>  2023-11-01 11:20:45 +0100
commit    4c71eec3cd5f5f36c1cdc6d2284f6dd93facc193 (patch)
tree      128765ac1e9cd9413323820f596ced3730091dea /experiment/analysis
parent    b8cf20494809549ad53b4664a039f9d71be7c29f (diff)
put analysis functions into own py file
Diffstat (limited to 'experiment/analysis')
-rw-r--r--  experiment/analysis/analysis.org    57
-rw-r--r--  experiment/analysis/analysis.pdf    bin 127259 -> 128522 bytes
-rw-r--r--  experiment/analysis/analysis.tex    116
-rw-r--r--  experiment/analysis/tools.py        60
4 files changed, 127 insertions, 106 deletions
diff --git a/experiment/analysis/analysis.org b/experiment/analysis/analysis.org
index 7f6a58d..e726046 100644
--- a/experiment/analysis/analysis.org
+++ b/experiment/analysis/analysis.org
@@ -4,10 +4,11 @@
* Imports
#+begin_src python :results none
import pandas as pd
-import pickle
from pathlib import Path
from pprint import pprint
+import tools
+
#+end_src
* Constants
@@ -18,13 +19,6 @@ procedures = ["1", "2", "3", "4", "5", "6", "overall"]
#+end_src
* Import Data
-#+begin_src python :results none
-def unpickle(pkl):
- with open(pkl, "rb") as f:
- data = pickle.load(f)
- return data
-#+end_src
-
** Conditions
#+begin_src python
conditions = [x.stem for x in data_path.iterdir() if x.is_dir()]
@@ -35,59 +29,24 @@ conditions
| random | fixed | blocked |
** Data
-#+begin_src python
+#+begin_src python :results none
data = {}
for condition in conditions:
data[condition] = {}
for vp in (data_path / condition).iterdir():
- data[condition][vp.stem] = unpickle(vp / "vp.pkl")
-#+end_src
+ data[condition][vp.stem] = tools.unpickle(vp / "vp.pkl")
-#+RESULTS:
-: None
+data_train, data_test = tools.train_test_split(data)
+#+end_src
* Basic statistics
** Total percent correct
To find out how well VPs solved the tasks, we calculate the accuracy for the train
and test phases.
-#+begin_src python :results none
-def count_correct(vp, trials):
- trials_correct = {}
- for proc in procedures:
- trials_correct[proc] = 0
- for sample in trials:
- for proc in vp[sample]["procedure_order"]:
- vp_ans = vp[sample][proc]["answer"]
- for c in vp_ans:
- if not c.isdigit():
- vp_ans = vp_ans.replace(c, "")
- vp_ans = int(vp_ans)
- if vp_ans == vp[sample]["water_sample"][proc][0]:
- trials_correct[proc] += 1
- return trials_correct
-#+end_src
-
-#+begin_src python :results none
-def total_accuracy(vp):
- train_total = len(train) * len(vp[train[0]]["procedure_order"])
- test_total = len(test) * len(vp[test[0]]["procedure_order"])
-
- acc_train = count_correct(vp, train)
- acc_test = count_correct(vp, test)
-
- acc_train = sum([acc_train[x] for x in acc_train.keys()]) / train_total
- acc_test = sum([acc_test[x] for x in acc_test.keys()]) / test_total
-
- return acc_train, acc_test
-#+end_src
-
#+begin_src python
-train = [x for x in vp.keys() if "train" in x]
-test = [x for x in vp.keys() if "test" in x]
-
condition = "random"
-df = pd.DataFrame([total_accuracy(data[condition][vp]) for vp in data[condition].keys()], index=data[condition].keys(), columns=["train", "test"])
+df = pd.DataFrame([tools.total_accuracy(data[condition][vp], procedures) for vp in data[condition].keys()], index=data[condition].keys(), columns=["train", "test"])
df
#+end_src
@@ -116,7 +75,7 @@ To investigate, we look at the per procedure accuracy per subject.
#+begin_src python
condition = "random"
proc_accs = [
- count_correct(data[condition][vp], data[condition][vp].keys())
+ tools.count_correct(data[condition][vp], data[condition][vp].keys(), procedures)
for vp in data[condition].keys()
]
for vp in proc_accs:
diff --git a/experiment/analysis/analysis.pdf b/experiment/analysis/analysis.pdf
index c12a0e2..751625d 100644
--- a/experiment/analysis/analysis.pdf
+++ b/experiment/analysis/analysis.pdf
Binary files differ
diff --git a/experiment/analysis/analysis.tex b/experiment/analysis/analysis.tex
index 2896b4f..5b56bc8 100644
--- a/experiment/analysis/analysis.tex
+++ b/experiment/analysis/analysis.tex
@@ -1,4 +1,4 @@
-% Created 2023-10-23 Mon 20:13
+% Created 2023-10-28 Sat 19:43
% Intended LaTeX compiler: pdflatex
\documentclass[11pt]{article}
\usepackage[utf8]{inputenc}
@@ -30,30 +30,26 @@
\tableofcontents
\section{Imports}
-\label{sec:orgf19bf7c}
+\label{sec:orgbdc2c77}
\begin{verbatim}
import pandas as pd
-import pickle
from pathlib import Path
+from pprint import pprint
+
+import tools
\end{verbatim}
\section{Constants}
-\label{sec:orgb587203}
+\label{sec:orgcb8c537}
\begin{verbatim}
data_path = Path("/home/niclas/repos/uni/master_thesis/experiment/data")
procedures = ["1", "2", "3", "4", "5", "6", "overall"]
\end{verbatim}
\section{Import Data}
-\label{sec:org3427b7b}
-\begin{verbatim}
-def unpickle(pkl):
- with open(pkl, "rb") as f:
- data = pickle.load(f)
- return data
-\end{verbatim}
+\label{sec:org87e67b0}
\subsection{Conditions}
-\label{sec:org9e15909}
+\label{sec:orga12f2b6}
\begin{verbatim}
conditions = [x.stem for x in data_path.iterdir() if x.is_dir()]
conditions
@@ -65,70 +61,55 @@ random & fixed & blocked\\[0pt]
\end{tabular}
\end{center}
\subsection{Data}
-\label{sec:org65d4664}
+\label{sec:orgcac95cb}
\begin{verbatim}
data = {}
for condition in conditions:
data[condition] = {}
for vp in (data_path / condition).iterdir():
- data[condition][vp.stem] = unpickle(vp / "vp.pkl")
+ data[condition][vp.stem] = tools.unpickle(vp / "vp.pkl")
+\end{verbatim}
+
+\begin{verbatim}
+None
+\end{verbatim}
+\subsection{Useful Subdata}
+\label{sec:org4384120}
+\begin{verbatim}
+# data_correct = {conditions[0]: {}, conditions[1]: {}, conditions[2]: {}}
+pass
+# for condition in conditions:
+# data_correct[condition] = None
\end{verbatim}
\begin{verbatim}
None
\end{verbatim}
\section{Basic statistics}
-\label{sec:orgea2a5f1}
+\label{sec:org44d0851}
\subsection{Total percent correct}
-\label{sec:org2eef721}
+\label{sec:org461b551}
To find out how well VPs solved the tasks, we calculate the accuracy for the train
and test phases.
\begin{verbatim}
-def percent_correct(vp):
- train = [x for x in vp.keys() if "train" in x]
- test = [x for x in vp.keys() if "test" in x]
-
- train_total = len(train) * len(vp[train[0]]["procedure_order"])
- test_total = len(test) * len(vp[test[0]]["procedure_order"])
-
- train_correct = 0
- test_correct = 0
-
- def count_correct(trials):
- trials_correct = 0
- for sample in trials:
- for proc in vp[sample]["procedure_order"]:
- vp_ans = vp[sample][proc]["answer"]
- for c in vp_ans:
- if not c.isdigit():
- vp_ans = vp_ans.replace(c, "")
- vp_ans = int(vp_ans)
- if vp_ans == vp[sample]["water_sample"][proc][0]:
- trials_correct += 1
- return trials_correct
-
- return count_correct(train) / train_total, count_correct(test) / test_total
-\end{verbatim}
-
-\begin{verbatim}
condition = "random"
-df = pd.DataFrame([percent_correct(data[condition][vp]) for vp in data[condition].keys()], columns=["train", "test"])
+df = pd.DataFrame([tools.total_accuracy(data[condition][vp], procedures) for vp in data[condition].keys()], index=data[condition].keys(), columns=["train", "test"])
df
\end{verbatim}
\begin{verbatim}
- train test
-0 0.822222 0.820000
-1 0.966667 0.800000
-2 0.973333 0.980000
-3 0.911111 0.960000
-4 0.906667 0.980000
-5 0.924444 0.943333
-6 0.957778 0.926667
-7 0.857778 0.946667
-8 0.962222 0.970000
-9 0.982222 0.986667
+ train test
+vp12 0.822222 0.820000
+vp19 0.966667 0.800000
+vp15 0.973333 0.980000
+vp17 0.911111 0.960000
+vp20 0.906667 0.980000
+vp10 0.924444 0.943333
+vp16 0.957778 0.926667
+vp13 0.857778 0.946667
+vp18 0.962222 0.970000
+vp14 0.982222 0.986667
\end{verbatim}
Most subjects have an accuracy of over 95\% in both the training and test phases.
@@ -139,10 +120,31 @@ present in both, or only one of the two phases.
To investigate, we look at the per procedure accuracy per subject.
\begin{verbatim}
-pass
+condition = "random"
+proc_accs = [
+ tools.count_correct(data[condition][vp], data[condition][vp].keys(), procedures)
+ for vp in data[condition].keys()
+]
+for vp in proc_accs:
+ for proc in vp.keys():
+ vp[proc] /= len(next(iter(data[condition].values())).keys())
+df = pd.DataFrame(proc_accs, index=data[condition].keys())
+df
\end{verbatim}
\begin{verbatim}
-None
+ 1 2 3 4 5 6 overall
+vp12 0.992 0.592 0.392 0.976 0.960 1.000 0.016
+vp19 1.000 0.992 0.000 0.576 0.992 0.992 0.848
+vp15 0.992 0.992 0.960 0.392 0.592 1.000 0.928
+vp17 0.392 0.968 0.584 1.000 1.000 0.992 0.648
+vp20 0.992 0.376 0.952 0.976 0.976 0.560 0.784
+vp10 0.968 0.360 0.592 0.984 0.984 0.992 0.712
+vp16 0.976 0.600 0.376 0.976 0.992 1.000 0.752
+vp13 0.384 0.960 0.928 0.560 0.992 0.968 0.568
+vp18 0.976 0.976 0.960 0.392 0.600 0.984 0.904
+vp14 0.992 0.976 0.992 0.976 0.400 0.600 0.968
\end{verbatim}
+
+We can see that most VPs have around 2 procedures with an accuracy of around 50\%.
\end{document} \ No newline at end of file
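
The 50\% observation above can be read off the per-procedure table directly. A minimal sketch of how to quantify it, assuming the df of per-procedure accuracies from the preceding block; the 0.6 cutoff is an illustrative assumption, not a value taken from the analysis:

    # Count, per VP, how many procedures fall below an (assumed) 0.6 accuracy cutoff.
    low_procs = (df.drop(columns=["overall"]) < 0.6).sum(axis=1)
    low_procs  # one entry per VP, e.g. vp12 -> 2
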
diff --git a/experiment/analysis/tools.py b/experiment/analysis/tools.py
new file mode 100644
index 0000000..d32ccd3
--- /dev/null
+++ b/experiment/analysis/tools.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+
+import pickle
+from copy import deepcopy
+
+def unpickle(pkl):
+ with open(pkl, "rb") as f:
+ data = pickle.load(f)
+ return data
+
+
+def count_correct(vp, trials, procedures):
+ trials_correct = {}
+ for proc in procedures:
+ trials_correct[proc] = 0
+ for sample in trials:
+ for proc in vp[sample]["procedure_order"]:
+ vp_ans = vp[sample][proc]["answer"]
+ for c in vp_ans:
+ if not c.isdigit():
+ vp_ans = vp_ans.replace(c, "")
+ vp_ans = int(vp_ans)
+ if vp_ans == vp[sample]["water_sample"][proc][0]:
+ trials_correct[proc] += 1
+ return trials_correct
+
+
+def total_accuracy(vp, procedures):
+ train = [x for x in vp.keys() if "train" in x]
+ test = [x for x in vp.keys() if "test" in x]
+
+ train_total = len(train) * len(vp[train[0]]["procedure_order"])
+ test_total = len(test) * len(vp[test[0]]["procedure_order"])
+
+ acc_train = count_correct(vp, train, procedures)
+ acc_test = count_correct(vp, test, procedures)
+
+ acc_train = sum([acc_train[x] for x in acc_train.keys()]) / train_total
+ acc_test = sum([acc_test[x] for x in acc_test.keys()]) / test_total
+
+ return acc_train, acc_test
+
+
+def train_test_split(data):
+ def delete_trials(data, string):
+ new_dict = {}
+ for cond in data.keys():
+ new_dict[cond] = {}
+ for vp in data[cond].keys():
+ new_dict[cond][vp] = {}
+ for trial in data[cond][vp].keys():
+ if string in trial:
+ new_dict[cond][vp][trial] = data[cond][vp][trial]
+ return new_dict
+ data_train = delete_trials(data, "train")
+ data_test = delete_trials(data, "test")
+
+ return data_train, data_test
+
+print("imported tools")