Zwischenstand
This commit is contained in:
115
Sonstiges/TENTW/auswertung.py
Normal file
115
Sonstiges/TENTW/auswertung.py
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
# https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
|
||||||
|
# based on: https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
|
||||||
|
# pip3 install openpyxl
|
||||||
|
import os
|
||||||
|
import pandas as pd # To read data
|
||||||
|
import math as m
|
||||||
|
import numpy as np
|
||||||
|
import scipy as sp
|
||||||
|
from scipy import stats
|
||||||
|
import matplotlib.pyplot as plt # To visualize
|
||||||
|
|
||||||
|
# Absolute directory of this script, so that data files lying next to it are
# found no matter which working directory the script is started from.
scriptDir = os.path.dirname(__file__)
__location__ = os.path.realpath(os.path.join(os.getcwd(), scriptDir))

###
# Lecture 5
###
# ! Records with missing cells in the HO columns were already removed in
# ! Excel beforehand.

excelFile = f"{__location__}/Daten_Umfrage_SPSS_20211113.xlsx"
df = pd.read_excel(excelFile)
print(df.head(10))

# Coerce every column to numbers; anything non-numeric becomes NaN,
# e.g. header "row 1" "CodeXYZ" -> "row 1" "NaN".
df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))
print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
print(df.head(10))
# Sample output:
#    Code  SE01_01  SE01_02  SE02_01  SE02_02  SE03_01
# 0   NaN      NaN      NaN      NaN      NaN      NaN
# 1   NaN      6.0      4.0      7.0      4.0      5.0
# ...
# df = df.dropna()  # CAUTION: drops every row that contains even a single NaN!

print(df.tail(10))
# Sample output:
#      Code  SE01_01  SE01_02  SE02_01  SE02_02  SE03_01
# 155   NaN      4.0      4.0      3.0      5.0      1.0
# 156   NaN      NaN      NaN      NaN      NaN      NaN
# (End of File)

# Keep positional rows 1..155 only; per the sample output above, rows 0 and
# 156 hold no values.
df = df.iloc[1:156]
|
||||||
|
|
||||||
|
# ! Already computed in Excel beforehand:
# !   - the columns "HO_Score_Bewerbung_Roh" and "SS_Score"
# !   - the values for the difference of the means of HOx_1 and HOx_2
# !     -> "MW" (mean), "Normiert" (normalised), "Invertieren" (inverted)
# !   - based on those, the column "HO_Score_Bewerbung_Gewichtet"
# The "MW" / "Normiert" / "Invertieren" calculation is nevertheless repeated
# here for HO01_Diff as a worked example.

# Arithmetic mean of the column, ignoring NaN rows, rounded to 2 decimals.
mwHO01_Diff = round(df["HO01_Diff"].mean(skipna=True), 2)

# Normalise to a non-negative value: magnitude of mean / 6.
# abs() replaces the former sqrt(x**2) detour.
# NOTE(review): the divisor 6 is presumably the largest possible difference
# on a 1..7 scale - confirm.
normHO01_Diff = float(abs(mwHO01_Diff / 6))

# Invert, so that a small difference yields a value close to 1.
invHO01_Diff = 1 - normHO01_Diff
# etc.

print("HO01_Diff Mittelwert:", mwHO01_Diff)
print("HO01_Diff Normiert:", normHO01_Diff)
print("HO01_Diff Invertiert:", invHO01_Diff)
|
||||||
|
# usw
|
||||||
|
|
||||||
|
###
# Linear regression for x="SS_Score" and y="HO_Score_Bewerbung_Gewichtet"
###
# Use only rows in which both values are present: a single NaN would
# propagate into slope and intercept of the fit.
dfPairs = df[["SS_Score", "HO_Score_Bewerbung_Gewichtet"]].dropna()
dfColumnX = dfPairs["SS_Score"]
dfColumnY = dfPairs["HO_Score_Bewerbung_Gewichtet"]

# Convert the DataFrame columns to NumPy arrays holding the x and y values,
# e.g. before: "1   16.0", after: "[16. 18. 21. ...]".
arrX = np.asarray(dfColumnX)
arrY = np.asarray(dfColumnY)

# Sort both arrays in sync (primary key x, secondary key y). lexsort keeps
# them as NumPy arrays, so no conversion back from tuples is needed.
sortOrder = np.lexsort((arrY, arrX))
arrX = arrX[sortOrder]
arrY = arrY[sortOrder]

# Least-squares linear regression from SciPy stats.
regr_results = stats.linregress(arrX, arrY)
steigung = round(regr_results.slope, 4)          # slope
yAchsAbschn = round(regr_results.intercept, 4)   # y-axis intercept
# Using y = m*x + n, calculate the y value on the regression line for every
# x value. The rounded coefficients are used on purpose so that the plotted
# line matches the printed equation.
arrYpredicted = steigung * arrX + yAchsAbschn
print("y =", steigung, "* x +", yAchsAbschn)

# Plot the regression line
# https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid')

# Show the plot image
plt.xlabel('SS_Score', color='black')
plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
plt.xlim([0, 50])  # x-axis view range, [from, to]
plt.scatter(arrX, arrY)
plt.legend()  # without this the 'Lin Regr' label is never displayed
plt.show()
|
||||||
|
|
||||||
|
# Score explained by the regression line: y = intercept + slope * x.
df["HO_Score_erklaert"] = df["SS_Score"] * steigung + yAchsAbschn

# Lecture 6
# NOTE: the result columns below are spelled "H0_..." (digit zero), unlike
# the "HO_..." (letter O) input columns; the names are kept as they are.

# Surplus of the raw score over the explained score; must not drop below 0.
rawMinusExplained = df["HO_Score_Bewerbung_Roh"] - df["HO_Score_erklaert"]
df["Diff_roh_erklaert"] = rawMinusExplained.clip(lower=0)

df["H0_Final"] = df["HO_Score_Bewerbung_Gewichtet"] + df["Diff_roh_erklaert"]

# Standardise the final score (z-score with pandas' default sample std).
finalScore = df["H0_Final"]
df["H0_Wert_Z_Wert"] = (finalScore - finalScore.mean()) / finalScore.std()

# Map the z-score onto the 7-point scale column: stretch by 1.5, centre on 4,
# round to 2 decimals and pin outliers to the range 1..7.
scaled = (df["H0_Wert_Z_Wert"] * 1.5) + 4
df["H0_Wert_7er_Skala"] = scaled.round(2).clip(lower=1, upper=7)
|
||||||
|
|
||||||
|
#df.to_csv(__location__ + '/tmpViewFile.csv', sep=";")
|
||||||
|
|
||||||
|
# Open Dataframe in Webbrowser:
|
||||||
|
def showDf(df):
    """Write *df* as an HTML table and open it in the default web browser.

    The table is stored as ``tmpViewFile.html`` in the module-level
    ``__location__`` directory; an existing file is overwritten.

    Args:
        df: Object offering ``to_html()``, e.g. a pandas DataFrame.
    """
    import webbrowser
    from pathlib import Path

    htmlPath = Path(__location__, "tmpViewFile.html").resolve()

    # Declare the charset inside the document and write with the same
    # encoding, so non-ASCII cell values display correctly on every platform.
    charset = '<meta charset="utf-8">'
    style = '<style> tr:nth-child(odd) { background-color: lightgrey; } </style>'
    dfHtml = charset + df.to_html() + style

    with open(htmlPath, "w", encoding="utf-8") as f:
        f.write(dfHtml)

    # Pass a file:// URI: given a bare filesystem path, webbrowser.open may
    # start the OS-associated program instead of a browser on some platforms.
    webbrowser.open(htmlPath.as_uri())
|
||||||
|
showDf(df)
|
||||||
@@ -1,79 +0,0 @@
|
|||||||
# https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
|
|
||||||
# or https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
|
|
||||||
# pip3 install openpyxl
|
|
||||||
from numpy.matrixlib import defmatrix
|
|
||||||
import os
|
|
||||||
import pandas as pd # To read data
|
|
||||||
import math as m
|
|
||||||
import numpy as np
|
|
||||||
import scipy as sp
|
|
||||||
from scipy import stats
|
|
||||||
import matplotlib.pyplot as plt # To visualize
|
|
||||||
|
|
||||||
# location will help to open files in the same directory as the py-script
|
|
||||||
__location__ = os.path.realpath(
|
|
||||||
os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
|
||||||
|
|
||||||
df = pd.read_excel(os.path.join(__location__,'Daten_Umfrage_SPSS_20211113.xlsx'))
|
|
||||||
|
|
||||||
df = df.apply(pd.to_numeric, errors='coerce') # convert non-numeric values to NaN, e.g. Header "row 1" "CodeXYZ" -> "row 1" "NaN"
|
|
||||||
print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
|
|
||||||
print(df.head(10))
|
|
||||||
# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
|
|
||||||
# 0 NaN NaN NaN NaN NaN NaN
|
|
||||||
# 1 NaN 6.0 4.0 7.0 4.0 5.0
|
|
||||||
# ...
|
|
||||||
#df = df.dropna() # CAUTION: drops every row that even contains single NaN !
|
|
||||||
|
|
||||||
print(df.tail(10))
|
|
||||||
# Code SE01_01 SE01_02 SE02_01 SE02_02 SE03_01
|
|
||||||
# 155 NaN 4.0 4.0 3.0 5.0 1.0
|
|
||||||
# 156 NaN NaN NaN NaN NaN NaN
|
|
||||||
# (End of File)
|
|
||||||
|
|
||||||
#print(df["HO_Score_Bewerbung_Gewichtet"][105:110])
|
|
||||||
#for col in df.columns:
|
|
||||||
#print(col)
|
|
||||||
|
|
||||||
# Calculate Mean, gew, inv
|
|
||||||
mwHO01_Diff = df["HO01_Diff"][1:156].mean(skipna=True) # skipna to ignore NaN rows
|
|
||||||
mwHO01_Diff = round(mwHO01_Diff, 2)
|
|
||||||
gewHO01_Diff = m.sqrt((mwHO01_Diff / 6)**2)
|
|
||||||
invHO01_Diff = 1 - gewHO01_Diff
|
|
||||||
# usw
|
|
||||||
print("HO01_Diff Mittelwert:", mwHO01_Diff)
|
|
||||||
print("HO01_Diff Gewichtet:", gewHO01_Diff)
|
|
||||||
print("HO01_Diff Invertiert:", invHO01_Diff)
|
|
||||||
# usw
|
|
||||||
|
|
||||||
# Limit Dataframe Column and row Amount
|
|
||||||
dfColumnX = df["SS_Score"][1:156]
|
|
||||||
dfColumnY = df["HO_Score_Bewerbung_Gewichtet"][1:156]
|
|
||||||
|
|
||||||
# Convert Dataframe Columns to Array containing the X- and Y- Values
|
|
||||||
arrX = np.asarray(dfColumnX) # convert that dataframe column to numpy array
|
|
||||||
arrY = np.asarray(dfColumnY)
|
|
||||||
|
|
||||||
# Prepare Plot Image
|
|
||||||
plt.xlabel('SS_Score', color='black')
|
|
||||||
plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
|
|
||||||
plt.xlim([0,50]) # set x-Axis View Range,[from,to]
|
|
||||||
plt.scatter(arrX, arrY)
|
|
||||||
|
|
||||||
arrX, arrY = zip(*sorted(zip(arrX,arrY))) # sort 2 arrays in sync
|
|
||||||
|
|
||||||
# Convert again, as sorting seemed to break the numpy array data format
|
|
||||||
arrX = np.asarray(arrX) # before: "1 16.0" after: "[16. 18. 21. ...]"
|
|
||||||
arrY = np.asarray(arrY)
|
|
||||||
|
|
||||||
# Use least Square Linear Regression from SciPy Stats
|
|
||||||
regr_results = sp.stats.linregress(arrX, arrY)
|
|
||||||
steigung = regr_results.slope
|
|
||||||
yAchsAbschn = regr_results.intercept
|
|
||||||
arrYpredicted = steigung * arrX + yAchsAbschn # using y = m*x + n, calculate every single Y-Value fitting the regression Lines X-Values
|
|
||||||
|
|
||||||
print("y =", steigung, "* x +", yAchsAbschn)
|
|
||||||
|
|
||||||
# Plot Linear Regression Line
|
|
||||||
plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
|
|
||||||
plt.show()
|
|
||||||
25
Vorlesung 6/autos.py
Normal file
25
Vorlesung 6/autos.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
import openpyxl
|
||||||
|
from scipy import stats
|
||||||
|
|
||||||
|
# Absolute directory of this script, so that files lying next to it are found
# no matter which working directory the script is started from.
__location__ = os.path.realpath(
    os.path.join(os.getcwd(), os.path.dirname(__file__)))

df = pd.read_csv(os.path.join(__location__, 'autos.txt'), sep=";")
df = df[:12]  # evaluate the first 12 rows only
print(df)

# numeric_only=True restricts the statistics to the numeric columns. Without
# it pandas >= 2.0 raises a TypeError because of the text columns
# "Car" and "Model".
mw = df.mean(numeric_only=True)
print(mw)

sta = df.std(numeric_only=True)
print(sta)

# One row per numeric column, holding its mean and standard deviation.
analysis = pd.DataFrame({"Mittelwert": mw, "Standardabw.": sta})
print(analysis)
analysis.to_excel(os.path.join(__location__, "auswertung.xlsx"))

# Pearson correlation between weight and volume (statistic and p-value).
corr = stats.pearsonr(df["Weight"], df["Volume"])
print("Corr:", corr)
|
||||||
37
Vorlesung 6/autos.txt
Normal file
37
Vorlesung 6/autos.txt
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
Car;Model;Volume;Weight;CO2
|
||||||
|
Toyota;Aygo;1000;790;99
|
||||||
|
Mitsubishi;Space Star;1200;1160;95
|
||||||
|
Skoda;Citigo;1000;929;95
|
||||||
|
Fiat;500;900;865;90
|
||||||
|
Mini;Cooper;1500;1140;105
|
||||||
|
VW;Up!;1000;929;105
|
||||||
|
Skoda;Fabia;1400;1109;90
|
||||||
|
Mercedes;A-Class;1500;1365;92
|
||||||
|
Ford;Fiesta;1500;1112;98
|
||||||
|
Audi;A1;1600;1150;99
|
||||||
|
Hyundai;I20;1100;980;99
|
||||||
|
Suzuki;Swift;1300;990;101
|
||||||
|
Ford;Fiesta;1000;1112;99
|
||||||
|
Honda;Civic;1600;1252;94
|
||||||
|
Hyundai;I30;1600;1326;97
|
||||||
|
Opel;Astra;1600;1330;97
|
||||||
|
BMW;1;1600;1365;99
|
||||||
|
Mazda;3;2200;1280;104
|
||||||
|
Skoda;Rapid;1600;1119;104
|
||||||
|
Ford;Focus;2000;1328;105
|
||||||
|
Ford;Mondeo;1600;1584;94
|
||||||
|
Opel;Insignia;2000;1428;99
|
||||||
|
Mercedes;C-Class;2100;1365;99
|
||||||
|
Skoda;Octavia;1600;1415;99
|
||||||
|
Volvo;S60;2000;1415;99
|
||||||
|
Mercedes;CLA;1500;1465;102
|
||||||
|
Audi;A4;2000;1490;104
|
||||||
|
Audi;A6;2000;1725;114
|
||||||
|
Volvo;V70;1600;1523;109
|
||||||
|
BMW;5;2000;1705;114
|
||||||
|
Mercedes;E-Class;2100;1605;115
|
||||||
|
Volvo;XC70;2000;1746;117
|
||||||
|
Ford;B-Max;1600;1235;104
|
||||||
|
BMW;2;1600;1390;108
|
||||||
|
Opel;Zafira;1600;1405;109
|
||||||
|
Mercedes;SLK;2500;1395;120
|
||||||
Reference in New Issue
Block a user