"""Survey evaluation: descriptive values for one column plus a linear regression plot.

The script reads an Excel export that lives next to this file, coerces every
cell to a number (non-numeric cells become NaN), prints a few descriptive
values for the ``HO01_Diff`` column and finally draws a scatter plot of
``SS_Score`` against ``HO_Score_Bewerbung_Gewichtet`` together with the
least-squares regression line.

Background reading used while writing the script:
    https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
    https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
    https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html

Reading ``.xlsx`` files with pandas needs: ``pip3 install openpyxl``
"""

import math as m  # noqa: F401  (kept from the original import list)
import os

import numpy as np
import pandas as pd  # To read data
import scipy as sp  # noqa: F401  (kept from the original import list)
from numpy.matrixlib import defmatrix  # noqa: F401  (kept from the original import list)
from scipy import stats

# __location__ helps to open files in the same directory as the py-script.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

DATA_FILE = 'Daten_Umfrage_SPSS_20211113.xlsx'

# Positional row window that is evaluated. According to the sample output
# noted in the original script, row 0 and row 156 of the sheet contain only
# NaN, so the window skips them.
ROW_SLICE = slice(1, 156)

MEAN_COLUMN = "HO01_Diff"
X_COLUMN = 'SS_Score'
Y_COLUMN = 'HO_Score_Bewerbung_Gewichtet'

# Divisor applied to the HO01_Diff mean.
# NOTE(review): presumably the width of the answer scale - confirm.
HO01_DIFF_DIVISOR = 6


def weighted_and_inverted(mean_value, divisor=HO01_DIFF_DIVISOR):
    """Return ``(weighted, inverted)`` for a column mean.

    ``weighted`` is the magnitude of ``mean_value / divisor`` (the original
    spelled this as ``sqrt((mean / 6) ** 2)``, which is just the absolute
    value) and ``inverted`` is ``1 - weighted``.
    """
    weighted = abs(float(mean_value) / divisor)
    return weighted, 1 - weighted


def sorted_finite_pairs(x_values, y_values):
    """Return x and y as float arrays without NaN/inf pairs, sorted by x.

    A pair is dropped as soon as either coordinate is NaN or infinite;
    otherwise a single missing survey answer would turn the regression
    slope and intercept into NaN. Pairs are ordered by x, ties by y, which
    is the same order ``sorted(zip(x, y))`` yields for finite data.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    x = np.asarray(x_values, dtype=float)
    y = np.asarray(y_values, dtype=float)
    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")

    keep = np.isfinite(x) & np.isfinite(y)
    x, y = x[keep], y[keep]

    # lexsort treats the LAST key as the primary one: sort by x, then by y.
    order = np.lexsort((y, x))
    return x[order], y[order]


def fit_line(x, y):
    """Fit ``y = slope * x + intercept`` by least squares (SciPy Stats).

    Returns:
        ``(slope, intercept, predicted)`` where ``predicted`` holds the
        regression line's y-value for every entry of ``x``.

    Raises:
        ValueError: if fewer than two points are supplied.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size < 2:
        raise ValueError(
            "need at least two finite (x, y) pairs for a linear regression")

    result = stats.linregress(x, y)
    # Using y = m*x + n, calculate the Y-value for every X-value of the line.
    predicted = result.slope * x + result.intercept
    return result.slope, result.intercept, predicted


def plot_regression(x, y, predicted):
    """Show the scatter plot of the data and the regression line on top."""
    # Imported here rather than at module level so that the numeric helpers
    # stay importable on machines where matplotlib is not installed.
    import matplotlib.pyplot as plt  # To visualize

    plt.xlabel(X_COLUMN, color='black')
    plt.ylabel(Y_COLUMN, color='black')
    plt.xlim([0, 50])  # set x-axis view range, [from, to]
    plt.scatter(x, y)
    plt.plot(x, predicted, label='Lin Regr', color='red', linestyle='solid')
    plt.show()


def main():
    """Load the survey data, print descriptive values and plot the regression."""
    df = pd.read_excel(os.path.join(__location__, DATA_FILE))
    # Convert non-numeric values to NaN,
    # e.g. header "row 1" "CodeXYZ" -> "row 1" "NaN".
    df = df.apply(pd.to_numeric, errors='coerce')
    print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)

    # Sample output noted in the original script: the first and the last
    # sheet row (0 and 156) are all-NaN, and the "Code" column is NaN
    # throughout after the numeric coercion.
    # CAUTION: df.dropna() is deliberately not used - it would drop every
    # row that contains even a single NaN.
    print(df.head(10))
    print(df.tail(10))

    # Mean, weighted and inverted value of HO01_Diff (etc. for further columns).
    # skipna ignores NaN rows.
    mean_ho01_diff = df[MEAN_COLUMN].iloc[ROW_SLICE].mean(skipna=True)
    mean_ho01_diff = round(mean_ho01_diff, 2)
    weighted, inverted = weighted_and_inverted(mean_ho01_diff)
    print("HO01_Diff Mittelwert:", mean_ho01_diff)
    print("HO01_Diff Gewichtet:", weighted)
    print("HO01_Diff Invertiert:", inverted)

    # Limit the dataframe to the two columns and the evaluated row window,
    # then drop incomplete pairs and sort both arrays in sync.
    arr_x, arr_y = sorted_finite_pairs(
        df[X_COLUMN].iloc[ROW_SLICE],
        df[Y_COLUMN].iloc[ROW_SLICE],
    )

    slope, intercept, predicted = fit_line(arr_x, arr_y)
    print("y =", slope, "* x +", intercept)

    plot_regression(arr_x, arr_y, predicted)


if __name__ == "__main__":
    main()