import csv

import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Load the labelled SMS corpus, read as two tab-separated columns (label, message)
# with no header row.
# quoting=csv.QUOTE_NONE: the messages are free text, so a double quote is an
#   ordinary character. With pandas' default quoting, a field that begins with '"'
#   is read as a quoted field that runs to the next closing quote, which can
#   swallow tabs/newlines and merge several messages into a single row.
# keep_default_na=False: keep texts such as "NA" or "null" as strings instead of
#   converting them to NaN, which TfidfVectorizer rejects as a document.
data = pd.read_csv(
    'Inne/SMSSpamCollection',
    header=None,
    names=["etykieta", "wiadomosc"],
    delimiter="\t",
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
# Rename the 'ham' label to 'nie-spam'; other labels are left untouched.
data['etykieta'] = data['etykieta'].replace('ham', 'nie-spam')
data.info()

X = data['wiadomosc']
y = data['etykieta']
# Hold out 30% of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Text-classification pipeline: raw message text -> TF-IDF features -> linear SVM.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(use_idf=True)),  # convert text to TF-IDF vectors
    # Seeded so repeated runs produce the same model, in line with the seeded
    # train/test split. MultinomialNB() can be swapped in here as an alternative
    # classifier.
    ('classifier', LinearSVC(random_state=42)),
])

# Fit on the training split, then label the held-out messages for evaluation.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Class names in the order used for the confusion-matrix rows and columns.
class_labels = ['nie-spam', 'spam']

print("Macierz konfuzji:")
# Passing labels= pins the matrix order to the names shown in the table and keeps
# it 2x2 even if one class is missing from y_test / y_pred (otherwise the
# hard-coded index/columns would not match the matrix shape).
# Rows are the true labels, columns are the predicted labels.
df = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)
print(df)
print("Accuracy score:", accuracy_score(y_test, y_pred))
print("Raport klasyfikacji dla SVM:")
print(classification_report(y_test, y_pred))

# Label new messages with the fitted pipeline and save the result.
# The input is read as a single text column with no header row.
# keep_default_na=False: keep texts such as "NA" or "null" as strings; with the
#   default NA parsing they become NaN and pipeline.predict() raises on them.
# NOTE(review): default quoting is kept because the file's format is not visible
#   here - if it holds raw, unquoted text, consider quoting=csv.QUOTE_NONE so a
#   message starting with '"' cannot absorb the following lines.
test_data = pd.read_csv(
    'Inne/nowe-dane-SMS.csv',
    header=None,
    names=["wiadomosc"],
    delimiter="\t",
    keep_default_na=False,
)
y2_pred = pipeline.predict(test_data['wiadomosc'])

# Pair each predicted label with its message and write a tab-separated file
# without the row index.
wyniki = pd.DataFrame({'etykieta': y2_pred, 'wiadomosc': test_data['wiadomosc']})
wyniki.to_csv('Inne/SMS-wyniki2.csv', index=False, sep="\t")
print(wyniki)

