Code fuer Hausarbeit a-c,e

2022-01-22 17:08:15 +01:00
parent b3e4a699e4
commit 14f7b483f9
7 changed files with 416 additions and 2 deletions
--- a/Hausarbeit/Beispielcode
+++ b/Hausarbeit/Beispielcode
@@ -0,0 +1,84 @@
+```
+# https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
+#   based on: https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
+
+# pip3 install openpyxl
+import os
+import pandas as pd  # To read data
+import math as m
+import numpy as np
+import scipy as sp 
+
+from scipy import stats
+import matplotlib.pyplot as plt  # To visualize
+
+# Resolved directory of this script, used to open files that sit next to the .py file
+__location__ = os.path.realpath(
+    os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+df = pd.read_excel(os.path.join(__location__,'Daten_Umfrage_SPSS_20211113.xlsx'))
+
+df = df.apply(pd.to_numeric, errors='coerce')          # coerce every column to numeric; cells that cannot be parsed become NaN, e.g. "CodeXYZ" -> NaN
+print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
+print(df.head(10))
+#        Code     SE01_01  SE01_02  SE02_01  SE02_02  SE03_01 
+# 0      NaN      NaN      NaN      NaN      NaN      NaN   
+# 1      NaN      6.0      4.0      7.0      4.0      5.0   
+# ...
+#df = df.dropna()                                   # CAUTION: would drop every row that contains even a single NaN!
+
+print(df.tail(10))
+#          Code     SE01_01  SE01_02  SE02_01  SE02_02  SE03_01  
+# 155      NaN      4.0      4.0      3.0      5.0      1.0  
+# 156      NaN      NaN      NaN      NaN      NaN      NaN 
+# (End of File)
+
+#print(df["HO_Score_Bewerbung_Gewichtet"][105:110])
+#for col in df.columns:
+        #print(col)
+
+# Mean of HO01_Diff, then normalised (|mean| / 6) and inverted (1 - normalised)
+mwHO01_Diff = df["HO01_Diff"][1:156]            # select the column, rows 1..155 (rows 0 and 156 show up as all-NaN in the sample output above)
+mwHO01_Diff = mwHO01_Diff.mean(skipna=True)     # arithmetic mean of the column; skipna=True ignores NaN entries
+mwHO01_Diff = round(mwHO01_Diff, 2)             # NOTE: the value rounded to 2 decimals is what gets normalised below
+normHO01_Diff = m.sqrt((mwHO01_Diff / 6)**2)    # sqrt of the square = |mean / 6|; the divisor 6 is presumably the scale range - TODO confirm
+invHO01_Diff = 1 - normHO01_Diff                # invert: 1 - normalised value
+# etc.
+print("HO01_Diff Mittelwert:", mwHO01_Diff)
+print("HO01_Diff Normiert:", normHO01_Diff)
+print("HO01_Diff Invertiert:", invHO01_Diff)
+# etc.
+
+# Select the x and y columns for the regression, rows 1..155
+dfColumnX = df["SS_Score"][1:156]
+dfColumnY = df["HO_Score_Bewerbung_Gewichtet"][1:156]
+
+# Convert both columns to NumPy arrays holding the x and y values
+arrX = np.asarray(dfColumnX)                         # pandas Series -> NumPy array
+arrY = np.asarray(dfColumnY)                         
+
+# Set up the scatter plot
+plt.xlabel('SS_Score', color='black')
+plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
+plt.xlim([0,50])                                     # x-axis view range [from, to]
+plt.scatter(arrX, arrY)
+
+arrX, arrY = zip(*sorted(zip(arrX,arrY)))            # sort the (x, y) pairs by x (ties by y) so both sequences stay aligned
+
+# Convert back to NumPy arrays, because zip() returns plain tuples
+arrX = np.asarray(arrX)                              # before: "1   16.0" after: "[16. 18. 21. ...]"
+arrY = np.asarray(arrY)
+
+# Ordinary least-squares linear regression via scipy.stats.linregress
+regr_results = sp.stats.linregress(arrX, arrY)   # NOTE(review): NaN in either array presumably yields a NaN slope/intercept - confirm the inputs are NaN-free
+
+steigung = regr_results.slope
+yAchsAbschn = regr_results.intercept
+arrYpredicted = steigung * arrX + yAchsAbschn        # y = m*x + n evaluated at every x: the y-values on the regression line
+
+print("y =", steigung, "* x +", yAchsAbschn)
+
+# Draw the regression line on top of the scatter plot
+plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
+plt.show()
+```