Zwischenstand

2021-11-27 14:20:55 +01:00
parent ea08ba9b18
commit b3e4a699e4
4 changed files with 177 additions and 79 deletions
--- a/Sonstiges/TENTW/auswertung.py
+++ b/Sonstiges/TENTW/auswertung.py
@@ -0,0 +1,115 @@
 # https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
 #   based on: https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
 # pip3 install openpyxl
 import os
 import pandas as pd  # To read data
 import math as m
 import numpy as np
 import scipy as sp 
 from scipy import stats
 import matplotlib.pyplot as plt  # To visualize
 # Resolve the directory that contains this script so the data file is found
 # regardless of the current working directory.
 __location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))
 ###
 # Lecture 5
 ###
 # ! Records with missing cells in the HO columns were already deleted in Excel beforehand
 df = pd.read_excel(os.path.join(__location__, 'Daten_Umfrage_SPSS_20211113.xlsx'))
 print(df.head(10))
 # Coerce every cell to a number; anything non-numeric becomes NaN,
 # e.g. header "row 1" "CodeXYZ" -> "row 1" "NaN".
 df = df.apply(pd.to_numeric, errors='coerce')
 print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
 print(df.head(10))
 # Example output:
 #        Code     SE01_01  SE01_02  SE02_01  SE02_02  SE03_01
 # 0      NaN      NaN      NaN      NaN      NaN      NaN
 # 1      NaN      6.0      4.0      7.0      4.0      5.0
 # ...
 # NOTE: df.dropna() is deliberately not used here - it would drop every row
 # that contains even a single NaN.
 print(df.tail(10))
 # Example output:
 #          Code     SE01_01  SE01_02  SE02_01  SE02_02  SE03_01
 # 155      NaN      4.0      4.0      3.0      5.0      1.0
 # 156      NaN      NaN      NaN      NaN      NaN      NaN
 # (End of File)
 # Keep positional rows 1..155 only (rows 0 and 156 are all-NaN in the example
 # output above). The explicit copy makes df an independent frame, so the
 # column assignments further down do not write into a slice of the original
 # (which raises SettingWithCopyWarning in pandas < 3.0).
 df = df.iloc[1:156].copy()
 # ! Already computed in Excel beforehand:
 # ! the columns "HO_Score_Bewerbung_Roh" and "SS_Score"
 # ! the values for the difference of the means of HOx_1 and HOx_2 -> "MW", "Normiert", "Invertieren"
 # !   -> on that basis also the column "HO_Score_Bewerbung_Gewichtet"
 #     -> the computation of "MW", "Normiert", "Invertieren" is nevertheless repeated here as an example:
 # Arithmetic mean of the column (skipna ignores NaN rows), rounded to 2 decimals.
 mwHO01_Diff = round(df["HO01_Diff"].mean(skipna=True), 2)
 # Normalise: magnitude of the mean divided by 6. abs() replaces the former
 # sqrt(x**2) round-trip, which computes the same absolute value.
 # NOTE(review): 6 is presumably the largest possible difference on a 1-7 scale - confirm.
 normHO01_Diff = abs(mwHO01_Diff / 6)
 # Invert the normalised value.
 invHO01_Diff = 1 - normHO01_Diff
 # etc.
 print("HO01_Diff Mittelwert:", mwHO01_Diff)
 print("HO01_Diff Normiert:", normHO01_Diff)
 print("HO01_Diff Invertiert:", invHO01_Diff)
 # etc.
 ###
 # Linear regression for x="SS_Score" and y="HO_Score_Bewerbung_Gewichtet"
 ###
 # Pick the two DataFrame columns and convert them to NumPy arrays,
 # e.g. before: "1   16.0" after: "[16. 18. 21. ...]"
 dfColumnX = df["SS_Score"]
 dfColumnY = df["HO_Score_Bewerbung_Gewichtet"]
 arrX = np.asarray(dfColumnX)
 arrY = np.asarray(dfColumnY)
 # Drop incomplete pairs: a NaN in either column cannot contribute to the fit
 # and has no defined position in a sort.
 _valid = ~(np.isnan(arrX) | np.isnan(arrY))
 arrX, arrY = arrX[_valid], arrY[_valid]
 # Sort both arrays in sync by x (ties broken by y). lexsort uses its LAST key
 # as the primary key and keeps the data as NumPy arrays, so no re-conversion
 # is needed afterwards.
 _order = np.lexsort((arrY, arrX))
 arrX, arrY = arrX[_order], arrY[_order]
 # Least-squares linear regression from SciPy stats; slope and intercept are
 # rounded to 4 decimals before they are used.
 regr_results = sp.stats.linregress(arrX, arrY)
 steigung = round(regr_results.slope, 4)
 yAchsAbschn = round(regr_results.intercept, 4)
 # y = m*x + n evaluated for every x value gives the points of the regression line.
 arrYpredicted = steigung * arrX + yAchsAbschn
 print("y =", steigung, "* x +", yAchsAbschn)
 # Plot the regression line
 # (https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html)
 plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid')
 # Axis labels, x-axis view range [from, to] and the raw data points.
 plt.xlabel('SS_Score', color='black')
 plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
 plt.xlim([0,50])
 plt.scatter(arrX, arrY)
 # Without legend() the label given to plot() above is never displayed.
 plt.legend()
 plt.show()
 # Score explained by the regression line: slope * SS_Score + intercept.
 df["HO_Score_erklaert"] = df["SS_Score"] * steigung + yAchsAbschn
 # Lecture 6
 # Difference between raw and explained score; it must not drop below 0.
 df["Diff_roh_erklaert"] = df["HO_Score_Bewerbung_Roh"].sub(df["HO_Score_erklaert"]).clip(lower=0)
 # Final score = weighted score plus the (non-negative) difference.
 df["H0_Final"] = df["HO_Score_Bewerbung_Gewichtet"].add(df["Diff_roh_erklaert"])
 # z-value: distance from the column mean in units of the column's standard deviation.
 df["H0_Wert_Z_Wert"] = df["H0_Final"].sub(df["H0_Final"].mean()).div(df["H0_Final"].std())
 # Map onto the 7-point scale (z * 1.5 + 4), round to 2 decimals, then pin
 # outliers to the minimum 1 and the maximum 7.
 df["H0_Wert_7er_Skala"] = df["H0_Wert_Z_Wert"].mul(1.5).add(4).round(2).clip(lower=1, upper=7)
 # Optional CSV export (disabled): df.to_csv(__location__ + '/tmpViewFile.csv', sep=";")
 # Open Dataframe in Webbrowser:
 def showDf(df):
    """Write *df* as an HTML table next to this script and open it in the browser.
    The table is saved as ``tmpViewFile.html`` in ``__location__`` (overwriting
    any previous file) with every odd row shaded light grey.
    Args:
        df: the pandas DataFrame to display.
    """
    import webbrowser
    from pathlib import Path
    htmlPath = Path(__location__) / "tmpViewFile.html"
    style = '<style> tr:nth-child(odd) {  background-color: lightgrey; } </style>'
    # Write UTF-8 explicitly and declare it in the document, so non-ASCII cell
    # content does not depend on the platform's default encoding.
    dfHtml = '<meta charset="utf-8">' + df.to_html() + style
    htmlPath.write_text(dfHtml, encoding="utf-8")
    # webbrowser.open() expects a URL; passing a bare file name is documented
    # as neither supported nor portable, so hand over a file:// URI instead.
    webbrowser.open(htmlPath.as_uri())
 showDf(df)
--- a/Sonstiges/TENTW/linearregression.py
+++ b/Sonstiges/TENTW/linearregression.py
@@ -1,79 +0,0 @@
 # https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
 #   or https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
 # pip3 install openpyxl
 from numpy.matrixlib import defmatrix
 import os
 import pandas as pd  # To read data
 import math as m
 import numpy as np
 import scipy as sp 
 from scipy import stats
 import matplotlib.pyplot as plt  # To visualize
 # Directory of this script, used to open files that live next to it.
 _scriptDir = os.path.dirname(__file__)
 __location__ = os.path.realpath(os.path.join(os.getcwd(), _scriptDir))
 # Read the survey workbook and coerce every cell to a number; non-numeric
 # values become NaN, e.g. header "row 1" "CodeXYZ" -> "row 1" "NaN".
 _xlsxPath = os.path.join(__location__, 'Daten_Umfrage_SPSS_20211113.xlsx')
 df = pd.read_excel(_xlsxPath).apply(pd.to_numeric, errors='coerce')
 print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
 print(df.head(10))
 # Example output:
 #        Code     SE01_01  SE01_02  SE02_01  SE02_02  SE03_01
 # 0      NaN      NaN      NaN      NaN      NaN      NaN
 # 1      NaN      6.0      4.0      7.0      4.0      5.0
 # ...
 # NOTE: df.dropna() is deliberately not used - it would drop every row that
 # contains even a single NaN.
 print(df.tail(10))
 # Example output:
 #          Code     SE01_01  SE01_02  SE02_01  SE02_02  SE03_01
 # 155      NaN      4.0      4.0      3.0      5.0      1.0
 # 156      NaN      NaN      NaN      NaN      NaN      NaN
 # (End of File)
 #print(df["HO_Score_Bewerbung_Gewichtet"][105:110])
 #for col in df.columns:
        #print(col)
 # Mean, weighted and inverted value of HO01_Diff over positional rows 1..155.
 # skipna ignores NaN rows; the mean is rounded to 2 decimals.
 mwHO01_Diff = round(df["HO01_Diff"][1:156].mean(skipna=True), 2)
 # Magnitude of the mean divided by 6. abs() replaces the former sqrt(x**2)
 # round-trip, which computes the same absolute value.
 # NOTE(review): 6 is presumably the largest possible difference on a 1-7 scale - confirm.
 gewHO01_Diff = abs(mwHO01_Diff / 6)
 invHO01_Diff = 1 - gewHO01_Diff
 # etc.
 print("HO01_Diff Mittelwert:", mwHO01_Diff)
 print("HO01_Diff Gewichtet:", gewHO01_Diff)
 print("HO01_Diff Invertiert:", invHO01_Diff)
 # etc.
 # Pick the x/y columns, limited to positional rows 1..155.
 dfColumnX = df["SS_Score"][1:156]
 dfColumnY = df["HO_Score_Bewerbung_Gewichtet"][1:156]
 # Convert the DataFrame columns to NumPy arrays holding the x and y values.
 arrX = np.asarray(dfColumnX)
 arrY = np.asarray(dfColumnY)
 # Drop incomplete pairs: a NaN in either column cannot contribute to the fit
 # and has no defined position in a sort.
 _valid = ~(np.isnan(arrX) | np.isnan(arrY))
 arrX, arrY = arrX[_valid], arrY[_valid]
 # Prepare the plot: axis labels, x-axis view range [from, to], raw data points.
 plt.xlabel('SS_Score', color='black')
 plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
 plt.xlim([0,50])
 plt.scatter(arrX, arrY)
 # Sort both arrays in sync by x (ties broken by y). lexsort uses its LAST key
 # as the primary key and keeps the data as NumPy arrays, so no re-conversion
 # is needed afterwards.
 _order = np.lexsort((arrY, arrX))
 arrX, arrY = arrX[_order], arrY[_order]
 # Least-squares linear regression from SciPy stats.
 regr_results = sp.stats.linregress(arrX, arrY)
 steigung = regr_results.slope
 yAchsAbschn = regr_results.intercept
 # y = m*x + n evaluated for every x value gives the points of the regression line.
 arrYpredicted = steigung * arrX + yAchsAbschn
 print("y =", steigung, "* x +", yAchsAbschn)
 # Plot the regression line
 # (https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html)
 plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid')
 # Without legend() the label given to plot() above is never displayed.
 plt.legend()
 plt.show()
--- a/6/autos.py
+++ b/6/autos.py
@@ -0,0 +1,25 @@
 import os
 import pandas as pd
 import openpyxl
 from scipy import stats
 # Resolve the directory that contains this script so files next to it are
 # found regardless of the current working directory.
 __location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))
 # Semicolon-separated car data; only the first 12 rows are analysed.
 df = pd.read_csv(os.path.join(__location__, 'autos.txt'), sep=";")
 df = df[:12]
 print(df)
 # numeric_only=True restricts the statistics to the numeric columns. The file
 # also has text columns (Car, Model); reducing over those raises TypeError in
 # pandas >= 2.0, where numeric_only defaults to False.
 mw = df.mean(numeric_only=True)
 print(mw)
 sta = df.std(numeric_only=True)
 print(sta)
 # One row per numeric column, with its mean and standard deviation.
 analysis = pd.DataFrame({"Mittelwert": mw, "Standardabw.": sta})
 print(analysis)
 analysis.to_excel(os.path.join(__location__, "auswertung.xlsx"))
 # Pearson correlation between weight and volume; pearsonr returns the
 # coefficient together with its p-value.
 corr = stats.pearsonr(df["Weight"], df["Volume"])
 print("Corr:", corr)
--- a/6/autos.txt
+++ b/6/autos.txt
@@ -0,0 +1,37 @@
 Car;Model;Volume;Weight;CO2
 Toyota;Aygo;1000;790;99
 Mitsubishi;Space Star;1200;1160;95
 Skoda;Citigo;1000;929;95
 Fiat;500;900;865;90
 Mini;Cooper;1500;1140;105
 VW;Up!;1000;929;105
 Skoda;Fabia;1400;1109;90
 Mercedes;A-Class;1500;1365;92
 Ford;Fiesta;1500;1112;98
 Audi;A1;1600;1150;99
 Hyundai;I20;1100;980;99
 Suzuki;Swift;1300;990;101
 Ford;Fiesta;1000;1112;99
 Honda;Civic;1600;1252;94
 Hyundai;I30;1600;1326;97
 Opel;Astra;1600;1330;97
 BMW;1;1600;1365;99
 Mazda;3;2200;1280;104
 Skoda;Rapid;1600;1119;104
 Ford;Focus;2000;1328;105
 Ford;Mondeo;1600;1584;94
 Opel;Insignia;2000;1428;99
 Mercedes;C-Class;2100;1365;99
 Skoda;Octavia;1600;1415;99
 Volvo;S60;2000;1415;99
 Mercedes;CLA;1500;1465;102
 Audi;A4;2000;1490;104
 Audi;A6;2000;1725;114
 Volvo;V70;1600;1523;109
 BMW;5;2000;1705;114
 Mercedes;E-Class;2100;1605;115
 Volvo;XC70;2000;1746;117
 Ford;B-Max;1600;1235;104
 BMW;2;1600;1390;108
 Opel;Zafira;1600;1405;109
 Mercedes;SLK;2500;1395;120