"""Survey evaluation script (lectures 5 and 6).

Reads the survey export ``Daten_Umfrage_SPSS_20211113.xlsx`` located next to
this script, fits a least-squares line for ``SS_Score`` (x) against
``HO_Score_Bewerbung_Gewichtet`` (y), derives a final HO score from it,
rescales that score to a 1..7 scale and writes the resulting table to
``tmpViewFile.csv`` / ``tmpViewFile.html`` in the same directory.
"""
# https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
# based on: https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
# pip3 install openpyxl

import math as m
import os
from pathlib import Path

import matplotlib.pyplot as plt  # To visualize
import numpy as np
import pandas as pd  # To read data
import scipy as sp
from scipy import stats

# location will help to open files in the same directory as the py-script
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

###
# Lecture 5
###

# NOTE: records with missing cells in the HO columns were already removed
# in Excel beforehand.
df = pd.read_excel(__location__ + '/Daten_Umfrage_SPSS_20211113.xlsx')
print(df.head(10))

# Convert non-numeric values to NaN,
# e.g. Header "row 1" "CodeXYZ" -> "row 1" "NaN"
df = df.apply(pd.to_numeric, errors='coerce')
print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
print(df.head(10))
#      Code  SE01_01  SE01_02  SE02_01  SE02_02  SE03_01
# 0     NaN      NaN      NaN      NaN      NaN      NaN
# 1     NaN      6.0      4.0      7.0      4.0      5.0
# ...

# CAUTION: do not use df.dropna() here - it drops every row that contains
# even a single NaN.
print(df.tail(10))
#      Code  SE01_01  SE01_02  SE02_01  SE02_02  SE03_01
# 155   NaN      4.0      4.0      3.0      5.0      1.0
# 156   NaN      NaN      NaN      NaN      NaN      NaN
# (End of File)

# Limit to the data rows (positions 1..155). The explicit copy makes the
# result an independent frame, so the column assignments further down do not
# write into a slice of the original (SettingWithCopyWarning).
df = df.iloc[1:156].copy()

# Already calculated in Excel beforehand:
#   - the columns "HO_Score_Bewerbung_Roh" and "SS_Score"
#   - the values for the difference of the means of HOx_1 and HOx_2
#     -> "MW" (mean), "Normiert" (normalised), "Invertieren" (inverted)
#   - on that basis also the column "HO_Score_Bewerbung_Gewichtet"
# The mean / normalise / invert calculation is repeated here once as an
# example:
mwHO01_Diff = df["HO01_Diff"]  # Limit to Column
mwHO01_Diff = mwHO01_Diff.mean(skipna=True)  # arithmetic mean, NaN rows ignored
mwHO01_Diff = round(mwHO01_Diff, 2)
normHO01_Diff = m.fabs(mwHO01_Diff / 6)  # Norm: magnitude of mean / 6
invHO01_Diff = 1 - normHO01_Diff  # invert
# etc.
print("HO01_Diff Mittelwert:", mwHO01_Diff)
print("HO01_Diff Normiert:", normHO01_Diff)
print("HO01_Diff Invertiert:", invHO01_Diff)
# etc.

###
# Linear regression for x="SS_Score" and y="HO_Score_Bewerbung_Gewichtet"
###

# Choose Dataframe Columns
dfColumnX = df["SS_Score"]
dfColumnY = df["HO_Score_Bewerbung_Gewichtet"]

# Convert Dataframe Columns to Numpy Arrays containing the X- and Y-Values
arrX = np.asarray(dfColumnX, dtype=float)  # before: "1 16.0" after: "[16. 18. 21. ...]"
arrY = np.asarray(dfColumnY, dtype=float)

# linregress does not skip NaN: a single missing score would turn slope and
# intercept (and everything derived from them) into NaN, so only complete
# (x, y) pairs are used for the fit.
validPairs = ~(np.isnan(arrX) | np.isnan(arrY))
arrX = arrX[validPairs]
arrY = arrY[validPairs]

# Sort both arrays in sync: primary key X, secondary key Y.
sortOrder = np.lexsort((arrY, arrX))
arrX = arrX[sortOrder]
arrY = arrY[sortOrder]

# Use least Square Linear Regression from SciPy Stats
regr_results = sp.stats.linregress(arrX, arrY)
steigung = round(regr_results.slope, 4)  # slope
yAchsAbschn = round(regr_results.intercept, 4)  # y-axis intercept
# Using y = m*x + n, calculate the Y-Value on the regression line for every
# X-Value
arrYpredicted = steigung * arrX + yAchsAbschn
print("y =", steigung, "* x +", yAchsAbschn)

# Plot Linear Regression Line
# https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid')

# Show Plot Image
plt.xlabel('SS_Score', color='black')
plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
plt.xlim([0, 50])  # set x-Axis View Range, [from, to]
plt.scatter(arrX, arrY)
plt.show()

# HO score "explained" by the regression line
df["HO_Score_erklaert"] = yAchsAbschn + df["SS_Score"] * steigung

###
# Lecture 6
###

df["Diff_roh_erklaert"] = df["HO_Score_Bewerbung_Roh"] - df["HO_Score_erklaert"]
print("Diff_roh_erklaert_Mean", df["Diff_roh_erklaert"].mean())
# The difference must not go below 0
df["Diff_roh_erklaert"] = df["Diff_roh_erklaert"].clip(lower=0)

df["H0_Final"] = df["HO_Score_Bewerbung_Gewichtet"] + df["Diff_roh_erklaert"]
# Standardise: subtract the column mean, divide by the column std
df["H0_Wert_Z_Wert"] = (df["H0_Final"] - df["H0_Final"].mean()) / df["H0_Final"].std()
print("HO_Final mean", df["H0_Final"].mean(), "HO_Final std:", df["H0_Final"].std())
print("SS_Mean", df["SS_Score"].mean())

# Map the z-value onto the 7-point scale: z * 1.5 + 4, rounded to 2 decimals,
# with outliers pinned to min 1 / max 7
df["H0_Wert_7er_Skala"] = (df["H0_Wert_Z_Wert"] * 1.5) + 4
df["H0_Wert_7er_Skala"] = df["H0_Wert_7er_Skala"].round(2)
df["H0_Wert_7er_Skala"] = df["H0_Wert_7er_Skala"].clip(lower=1, upper=7)

df.to_csv(__location__ + '/tmpViewFile.csv', sep=";")


# Open Dataframe in Webbrowser:
def showDf(df):
    """Write *df* as an HTML table next to the script and open it in the browser.

    The table is written to ``tmpViewFile.html`` in the script directory as
    UTF-8, prefixed with a matching charset declaration. The browser is
    started only after the file has been closed, and it is handed a
    ``file://`` URI because passing a bare file name to ``webbrowser.open``
    is documented as neither supported nor portable.

    Args:
        df: The pandas DataFrame to display.
    """
    import webbrowser

    htmlPath = Path(__location__ + "/tmpViewFile.html")
    dfHtml = '<meta charset="utf-8">\n' + df.to_html()
    with open(htmlPath, "w", encoding="utf-8") as f:
        f.write(dfHtml)
    webbrowser.open(htmlPath.as_uri())


showDf(df)