diff --git a/Sonstiges/TENTW/auswertung.py b/Sonstiges/TENTW/auswertung.py
new file mode 100644
index 0000000..b219e42
--- /dev/null
+++ b/Sonstiges/TENTW/auswertung.py
@@ -0,0 +1,121 @@
+# https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
+# based on: https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
+# pip3 install openpyxl
+import os
+import pandas as pd # To read data
+import math as m
+import numpy as np
+import scipy as sp
+from scipy import stats
+import matplotlib.pyplot as plt # To visualize
+
+# location will help to open files in the same directory as the py-script
+__location__ = os.path.realpath(
+    os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+###
+# Lecture 5
+###
+# ! Records with missing cells in the HO columns were already deleted in Excel beforehand
+
+df = pd.read_excel(os.path.join(__location__, 'Daten_Umfrage_SPSS_20211113.xlsx'))
+print(df.head(10))
+
+df = df.apply(pd.to_numeric, errors='coerce') # convert non-numeric values to NaN, e.g. Header "row 1" "CodeXYZ" -> "row 1" "NaN"
+print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
+print(df.head(10))
+# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
+# 0 NaN NaN NaN NaN NaN NaN
+# 1 NaN 6.0 4.0 7.0 4.0 5.0
+# ...
+# CAUTION: do not use df.dropna() here, it drops every row that contains even a single NaN!
+
+print(df.tail(10))
+# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
+# 155 NaN 4.0 4.0 3.0 5.0 1.0
+# 156 NaN NaN NaN NaN NaN NaN
+# (End of File)
+
+# Limit to row amount. The .copy() makes df an independent frame, so the column
+# assignments further down do not write into a slice of the frame read above.
+df = df.iloc[1:156].copy()
+
+# ! Already calculated in Excel beforehand:
+# ! The columns "HO_Score_Bewerbung_Roh" and "SS_Score"
+# ! The values for the difference of the means of HOx_1 and HOx_2 -> "MW", "Normiert", "Invertieren"
+# !   -> on that basis also the column "HO_Score_Bewerbung_Gewichtet"
+# -> The Diff calculation ("MW", "Normiert", "Invertieren") is nevertheless repeated here as an example:
+mwHO01_Diff = df["HO01_Diff"].mean(skipna=True) # Columns arithm. mean, skipna to ignore NaN rows
+mwHO01_Diff = round(mwHO01_Diff, 2)
+normHO01_Diff = m.fabs(mwHO01_Diff / 6) # Norm: |mean / 6|, the same value as sqrt((mean / 6)**2)
+invHO01_Diff = 1 - normHO01_Diff # invert
+# etc.
+print("HO01_Diff Mittelwert:", mwHO01_Diff)
+print("HO01_Diff Normiert:", normHO01_Diff)
+print("HO01_Diff Invertiert:", invHO01_Diff)
+# etc.
+
+###
+# Linear regression for x="SS_Score" and y="HO_Score_Bewerbung_Gewichtet"
+###
+# Choose Dataframe Columns
+dfColumnX = df["SS_Score"]
+dfColumnY = df["HO_Score_Bewerbung_Gewichtet"]
+
+# Convert Dataframe Columns to Numpy Array containing the X- and Y- Values
+arrX = np.asarray(dfColumnX) # before: "1 16.0" after: "[16. 18. 21. ...]"
+arrY = np.asarray(dfColumnY)
+
+# Sort the 2 arrays in sync by X; indexing with the sort order keeps them numpy arrays
+sortOrder = np.argsort(arrX, kind="stable")
+arrX = arrX[sortOrder]
+arrY = arrY[sortOrder]
+
+# Use least Square Linear Regression from SciPy Stats
+# The coefficients are rounded to 4 decimals and those rounded values are used for everything below
+regr_results = sp.stats.linregress(arrX, arrY)
+steigung = round(regr_results.slope, 4)
+yAchsAbschn = round(regr_results.intercept, 4)
+arrYpredicted = steigung * arrX + yAchsAbschn # using y = m*x + n, calculate every single Y-Value fitting the regression Lines X-Values
+print("y =", steigung, "* x +", yAchsAbschn)
+
+# Plot Linear Regression Line
+plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
+# Show Plot Image
+plt.xlabel('SS_Score', color='black')
+plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
+plt.xlim([0,50]) # set x-Axis View Range,[from,to]
+plt.scatter(arrX, arrY)
+plt.show()
+
+df["HO_Score_erklaert"] = yAchsAbschn + df["SS_Score"] * steigung
+
+# Lecture 6
+# NOTE(review): the columns created below are spelled "H0_" (digit zero), the input columns "HO_" (letter O) - confirm this is intended
+diffRohErklaert = df["HO_Score_Bewerbung_Roh"] - df["HO_Score_erklaert"]
+df["Diff_roh_erklaert"] = diffRohErklaert.clip(lower=0) # the difference must not go below 0
+
+df["H0_Final"] = df["HO_Score_Bewerbung_Gewichtet"] + df["Diff_roh_erklaert"]
+
+df["H0_Wert_Z_Wert"] = (df["H0_Final"] - df["H0_Final"].mean()) / df["H0_Final"].std()
+
+skala7 = ((df["H0_Wert_Z_Wert"] * 1.5) + 4).round(2)
+df["H0_Wert_7er_Skala"] = skala7.clip(lower=1, upper=7) # pin outliers to Min 1 and Max 7
+
+#df.to_csv(__location__ + '/tmpViewFile.csv', sep=";")
+
+# Open Dataframe in Webbrowser:
+def showDf(df):
+    """Write df as an HTML table next to this script and open it in the web browser."""
+    import webbrowser
+    from pathlib import Path
+    htmlPath = Path(__location__) / "tmpViewFile.html"
+    style = ''
+    dfHtml = df.to_html() + style
+    # Non-ASCII characters become numeric character references, so the page does not depend on the OS default encoding
+    with open(htmlPath, "w", encoding="ascii", errors="xmlcharrefreplace") as f:
+        f.write(dfHtml)
+    # webbrowser.open() expects a URL, so pass a file:// URI instead of a bare path
+    webbrowser.open(htmlPath.as_uri())
+
+showDf(df)
diff --git a/Sonstiges/TENTW/linearregression.py b/Sonstiges/TENTW/linearregression.py
deleted file mode 100644
index 3a366ec..0000000
--- a/Sonstiges/TENTW/linearregression.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
-# or https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
-# pip3 install openpyxl
-from numpy.matrixlib import defmatrix
-import os
-import pandas as pd # To read data
-import math as m
-import numpy as np
-import scipy as sp
-from scipy import stats
-import matplotlib.pyplot as plt # To visualize
-
-# location will help to open files in the same directory as the py-script
-__location__ = os.path.realpath(
-    os.path.join(os.getcwd(), os.path.dirname(__file__)))
-
-df = pd.read_excel(os.path.join(__location__,'Daten_Umfrage_SPSS_20211113.xlsx'))
-
-df = df.apply(pd.to_numeric, errors='coerce') # convert non-numeric values to NaN, e.g. Header "row 1" "CodeXYZ" -> "row 1" "NaN"
-print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
-print(df.head(10))
-# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
-# 0 NaN NaN NaN NaN NaN NaN
-# 1 NaN 6.0 4.0 7.0 4.0 5.0
-# ...
-#df = df.dropna() # CAUTION: drops every row that even contains single NaN !
-
-print(df.tail(10))
-# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
-# 155 NaN 4.0 4.0 3.0 5.0 1.0
-# 156 NaN NaN NaN NaN NaN NaN
-# (End of File)
-
-#print(df["HO_Score_Bewerbung_Gewichtet"][105:110])
-#for col in df.columns:
-    #print(col)
-
-# Calculate Mean, gew, inv
-mwHO01_Diff = df["HO01_Diff"][1:156].mean(skipna=True) # skipna to ignore NaN rows
-mwHO01_Diff = round(mwHO01_Diff, 2)
-gewHO01_Diff = m.sqrt((mwHO01_Diff / 6)**2)
-invHO01_Diff = 1 - gewHO01_Diff
-# usw
-print("HO01_Diff Mittelwert:", mwHO01_Diff)
-print("HO01_Diff Gewichtet:", gewHO01_Diff)
-print("HO01_Diff Invertiert:", invHO01_Diff)
-# usw
-
-# Limit Dataframe Column and row Amount
-dfColumnX = df["SS_Score"][1:156]
-dfColumnY = df["HO_Score_Bewerbung_Gewichtet"][1:156]
-
-# Convert Dataframe Columns to Array containing the X- and Y- Values
-arrX = np.asarray(dfColumnX) # convert that dataframe column to numpy array
-arrY = np.asarray(dfColumnY)
-
-# Prepare Plot Image
-plt.xlabel('SS_Score', color='black')
-plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
-plt.xlim([0,50]) # set x-Axis View Range,[from,to]
-plt.scatter(arrX, arrY)
-
-arrX, arrY = zip(*sorted(zip(arrX,arrY))) # sort 2 arrays in sync
-
-# Convert again, as sorting seemed break the numpy array data format
-arrX = np.asarray(arrX) # before: "1 16.0" after: "[16. 18. 21. ...]"
-arrY = np.asarray(arrY)
-
-# Use least Square Linear Regression from SciPy Stats
-regr_results = sp.stats.linregress(arrX, arrY)
-steigung = regr_results.slope
-yAchsAbschn = regr_results.intercept
-arrYpredicted = steigung * arrX + yAchsAbschn # using y = m*x + n, calculate every single Y-Value fitting the regression Lines X-Values
-
-print("y =", steigung, "* x +", yAchsAbschn)
-
-# Plot Linear Regression Line
-plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
-plt.show()
\ No newline at end of file
diff --git a/Vorlesung 6/autos.py b/Vorlesung 6/autos.py
new file mode 100644
index 0000000..d90ed4a
--- /dev/null
+++ b/Vorlesung 6/autos.py
@@ -0,0 +1,27 @@
+import os
+import pandas as pd
+import openpyxl
+from scipy import stats
+
+# location will help to open files in the same directory as the py-script
+__location__ = os.path.realpath(
+    os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+df = pd.read_csv(os.path.join(__location__, 'autos.txt'), sep=";")
+df = df[:12] # only the first 12 cars are evaluated
+print(df)
+
+# numeric_only=True: "Car" and "Model" contain text, and newer pandas
+# versions raise a TypeError instead of silently skipping such columns
+mw = df.mean(numeric_only=True)
+print(mw)
+
+sta = df.std(numeric_only=True)
+print(sta)
+
+analysis = pd.DataFrame({"Mittelwert": mw, "Standardabw.": sta})
+print(analysis)
+analysis.to_excel(os.path.join(__location__, "auswertung.xlsx"))
+
+corr = stats.pearsonr(df["Weight"], df["Volume"])
+print("Corr:", corr)
diff --git a/Vorlesung 6/autos.txt b/Vorlesung 6/autos.txt
new file mode 100644
index 0000000..ab4a201
--- /dev/null
+++ b/Vorlesung 6/autos.txt
@@ -0,0 +1,37 @@
+Car;Model;Volume;Weight;CO2
+Toyota;Aygo;1000;790;99
+Mitsubishi;Space Star;1200;1160;95
+Skoda;Citigo;1000;929;95
+Fiat;500;900;865;90
+Mini;Cooper;1500;1140;105
+VW;Up!;1000;929;105
+Skoda;Fabia;1400;1109;90
+Mercedes;A-Class;1500;1365;92
+Ford;Fiesta;1500;1112;98
+Audi;A1;1600;1150;99
+Hyundai;I20;1100;980;99
+Suzuki;Swift;1300;990;101
+Ford;Fiesta;1000;1112;99
+Honda;Civic;1600;1252;94
+Hundai;I30;1600;1326;97
+Opel;Astra;1600;1330;97
+BMW;1;1600;1365;99
+Mazda;3;2200;1280;104
+Skoda;Rapid;1600;1119;104
+Ford;Focus;2000;1328;105
+Ford;Mondeo;1600;1584;94
+Opel;Insignia;2000;1428;99
+Mercedes;C-Class;2100;1365;99
+Skoda;Octavia;1600;1415;99
+Volvo;S60;2000;1415;99
+Mercedes;CLA;1500;1465;102
+Audi;A4;2000;1490;104
+Audi;A6;2000;1725;114
+Volvo;V70;1600;1523;109
+BMW;5;2000;1705;114
+Mercedes;E-Class;2100;1605;115
+Volvo;XC70;2000;1746;117
+Ford;B-Max;1600;1235;104
+BMW;2;1600;1390;108
+Opel;Zafira;1600;1405;109
+Mercedes;SLK;2500;1395;120