# https://towardsdatascience.com/five-regression-python-modules-that-every-data-scientist-must-know-a4e03a886853
#   based on: https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/
# pip3 install openpyxl
import os
import pandas as pd  # To read data
import math as m
import numpy as np
import scipy as sp 
from scipy import stats
import matplotlib.pyplot as plt  # To visualize

# Absolute directory of this script. Data files are opened relative to it, so
# the script works no matter which working directory it is started from.
_scriptDir = os.path.dirname(__file__)
__location__ = os.path.realpath(os.path.join(os.getcwd(), _scriptDir))

###
# Lecture 5
###
# NOTE: records with missing cells in the HO columns were already deleted in Excel beforehand.

# Read the survey export that lives next to this script.
df = pd.read_excel(os.path.join(__location__, 'Daten_Umfrage_SPSS_20211113.xlsx'))
print(df.head(10))

# Convert every cell to a number; anything non-numeric becomes NaN,
# e.g. header "row 1" "CodeXYZ" -> "row 1" "NaN".
df = df.apply(pd.to_numeric, errors='coerce')
print("Dataframe (Zeilen, Spalten, ...) inkl. NaN:", df.shape)
print(df.head(10))
#        Code     SE01_01  SE01_02  SE02_01  SE02_02  SE03_01 
# 0      NaN      NaN      NaN      NaN      NaN      NaN   
# 1      NaN      6.0      4.0      7.0      4.0      5.0   
# ...
# NOTE: df.dropna() is deliberately NOT used here - it would drop every row
# that contains even a single NaN.

print(df.tail(10))
#          Code     SE01_01  SE01_02  SE02_01  SE02_02  SE03_01  
# 155      NaN      4.0      4.0      3.0      5.0      1.0  
# 156      NaN      NaN      NaN      NaN      NaN      NaN 
# (End of File)

# Keep positional rows 1..155 only: row 0 and row 156 are all-NaN (see the
# sample output above). The explicit .copy() makes df an independent frame,
# so the column assignments further down in this script do not write into a
# slice of the original frame (which triggers pandas' SettingWithCopyWarning).
df = df.iloc[1:156].copy()

# NOTE: the following was already calculated in Excel beforehand:
#   - the columns "HO_Score_Bewerbung_Roh" and "SS_Score"
#   - for the difference of the means of HOx_1 and HOx_2 the values
#     "MW" (mean), "Normiert" (normalised) and "Invertieren" (inverted)
#     -> and, based on those, the column "HO_Score_Bewerbung_Gewichtet"
# The "MW" / "Normiert" / "Invertieren" calculation is nevertheless repeated
# here once, as an example, for the column "HO01_Diff":

# Arithmetic mean of the column (NaN rows ignored), rounded to two decimals.
mwHO01_Diff = round(df["HO01_Diff"].mean(skipna=True), 2)

# Normalise: divide by 6 and take the magnitude (square root of the square).
# NOTE(review): 6 is presumably the largest possible difference on the
# 7-point scale - confirm.
anteilHO01_Diff = mwHO01_Diff / 6
normHO01_Diff = m.sqrt(anteilHO01_Diff**2)

# Invert, so that a small normalised difference yields a value close to 1.
invHO01_Diff = 1 - normHO01_Diff
# ... and so on for the remaining columns

for beschriftung, wert in (
    ("HO01_Diff Mittelwert:", mwHO01_Diff),
    ("HO01_Diff Normiert:", normHO01_Diff),
    ("HO01_Diff Invertiert:", invHO01_Diff),
):
    print(beschriftung, wert)
# ... and so on for the remaining columns

###
# Linear regression for x="SS_Score" and y="HO_Score_Bewerbung_Gewichtet"
###
# Select the two DataFrame columns used as X and Y values.
dfColumnX = df["SS_Score"]
dfColumnY = df["HO_Score_Bewerbung_Gewichtet"]

# Convert the columns to NumPy arrays, e.g. "1   16.0" -> "[16. 18. 21. ...]".
arrX = np.asarray(dfColumnX)
arrY = np.asarray(dfColumnY)

# Keep only complete (x, y) pairs: a single NaN would otherwise turn slope and
# intercept of the regression into NaN.
gueltig = pd.notna(arrX) & pd.notna(arrY)
arrX, arrY = arrX[gueltig], arrY[gueltig]

# Sort both arrays in sync, by X first and by Y as tie-breaker. np.lexsort
# treats its LAST key as the primary one and keeps everything a NumPy array,
# so no conversion back from Python tuples is needed.
reihenfolge = np.lexsort((arrY, arrX))
arrX, arrY = arrX[reihenfolge], arrY[reihenfolge]

# Least-squares linear regression from SciPy Stats.
regr_results = stats.linregress(arrX, arrY)
# NOTE(review): slope and intercept are rounded to 4 decimals BEFORE they are
# used for the predictions below (and later in this script), so the predicted
# values match the printed equation rather than the unrounded fit.
steigung = round(regr_results.slope, 4)
yAchsAbschn = round(regr_results.intercept, 4)
# Using y = m*x + n, calculate the Y value on the regression line for every X value.
arrYpredicted = steigung * arrX + yAchsAbschn
print("y =", steigung, "* x +", yAchsAbschn)

# Plot the regression line ...
plt.plot(arrX, arrYpredicted, label='Lin Regr', color='red', linestyle='solid') # https://scriptverse.academy/tutorials/python-matplotlib-plot-straight-line.html
# ... and the raw data points on top of it.
plt.scatter(arrX, arrY)
plt.xlabel('SS_Score', color='black')
plt.ylabel('HO_Score_Bewerbung_Gewichtet', color='black')
plt.xlim([0,50])                                     # set x-axis view range, [from, to]
plt.legend()                                         # without this the 'Lin Regr' label is never shown
plt.show()

# Explained HO score: the value on the regression line for each row's SS_Score.
df["HO_Score_erklaert"] = df["SS_Score"] * steigung + yAchsAbschn

# Lecture 6
# Difference between the raw HO score and the explained HO score. Its mean is
# printed BEFORE negative differences are raised to 0.
rohMinusErklaert = df["HO_Score_Bewerbung_Roh"] - df["HO_Score_erklaert"]
print("Diff_roh_erklaert_Mean", rohMinusErklaert.mean())
df["Diff_roh_erklaert"] = rohMinusErklaert.clip(lower=0)  # the difference must not be below 0

# NOTE(review): the result columns below are spelled "H0_..." with the digit
# zero, while the input columns and the print labels use "HO" with the letter
# O - confirm which spelling is intended before changing anything.
df["H0_Final"] = df["HO_Score_Bewerbung_Gewichtet"] + df["Diff_roh_erklaert"]

# z-standardise the final score: (value - mean) / standard deviation.
h0Mittel = df["H0_Final"].mean()
h0Streuung = df["H0_Final"].std()
df["H0_Wert_Z_Wert"] = (df["H0_Final"] - h0Mittel) / h0Streuung
print("HO_Final mean", h0Mittel, "HO_Final std:", h0Streuung)
print("SS_Mean", df["SS_Score"].mean())

# Map the z-value linearly (factor 1.5, offset 4), round to two decimals and
# pin outliers to the range 1..7.
skalenWert = (df["H0_Wert_Z_Wert"] * 1.5 + 4).round(2)
df["H0_Wert_7er_Skala"] = skalenWert.clip(lower=1, upper=7)

# Export the complete frame, semicolon-separated, next to the script.
df.to_csv(__location__ + '/tmpViewFile.csv', sep=";")

# Open Dataframe in Webbrowser:
def showDf(df):
    """Write *df* as an HTML table next to the script and open it in a web browser.

    The table is written to "tmpViewFile.html" in the script directory
    (``__location__``); an existing file of that name is overwritten.
    Every odd table row gets a light grey background for readability.

    Args:
        df: Object providing ``to_html()`` (here: the pandas DataFrame).
    """
    import webbrowser
    from pathlib import Path

    htmlPath = Path(__location__) / "tmpViewFile.html"
    # Declare the encoding inside the document so the browser decodes
    # non-ASCII cell content the same way it is written below.
    charset = '<meta charset="utf-8">'
    style = '<style> tr:nth-child(odd) {  background-color: lightgrey; } </style>'
    # Explicit UTF-8 instead of the platform default encoding, which may not
    # be able to represent every character in the table.
    with open(htmlPath, "w", encoding="utf-8") as f:
        f.write(charset + df.to_html() + style)
    # webbrowser.open() expects a URL; passing a bare file name is documented
    # as neither supported nor portable, so hand over a file:// URI.
    webbrowser.open(htmlPath.as_uri())
showDf(df)