Titanic Survival Prediction (Data Reading, Processing, and Modeling)

  • Brief introduction:

This article builds a survival-prediction model for the passengers of the Titanic, based on the classic Kaggle competition.

Data set:

1. Download the data from the Kaggle Titanic project page: www.kaggle.com/c/titanic

2. Alternative download link (network drive): pan.baidu.com/s/1BfRZdCz6… Extraction code: JZB3

  • Code walkthrough

Data reading:

# %%
import tensorflow as tf
import keras
import pandas as pd
import numpy as np

# Load the training set; the path is relative to the current working directory.
data = pd.read_csv("titanic/train.csv")

# Show the first rows followed by the summary statistics of the numeric columns.
print(data.head(), data.describe(), sep="\n")
Copy the code

Data processing:

# %%
# Columns kept for modelling: the label ("Survived") first, then the features.
strs = "Survived Pclass Sex Age SibSp Parch Fare Embarked"
# split() with no argument splits on whitespace; split("") would raise
# "ValueError: empty separator".
clos = strs.split()
print(clos)
# %%
# Take an explicit copy so the assignments below modify an independent frame
# rather than a slice of `data` (avoids pandas' SettingWithCopyWarning).
x_datas = data[clos].copy()
print(x_datas.head())
# %%
# Number of missing values per column before imputation.
print(x_datas.isnull().sum())

# %%
# Impute missing values: the mean for Age, the most frequent value for Embarked.
x_datas["Age"] = x_datas["Age"].fillna(x_datas["Age"].mean())
x_datas["Embarked"] = x_datas["Embarked"].fillna(x_datas["Embarked"].mode()[0])

# One-hot encode the categorical columns. dtype=float keeps every column
# numeric: recent pandas versions otherwise emit bool dummies, and a frame
# that mixes bool and float columns may convert to an object array when it is
# handed to Keras.
x_datas = pd.get_dummies(x_datas, columns=["Pclass", "Sex", "Embarked"], dtype=float)
# Crude rescaling so Age and Fare are closer to the 0/1 range of the dummies.
x_datas["Age"] /= 100
x_datas["Fare"] /= 100

# Confirm that no missing values remain and inspect the encoded frame.
print(x_datas.isnull().sum())
print(x_datas.head())

# %%
# Ordered 75% / 25% train/test split (rows are not shuffled).
seq = int(0.75 * len(x_datas))

# Column 0 is the label (Survived); all remaining columns are features.
X, Y = x_datas.iloc[:, 1:], x_datas.iloc[:, 0]
X_train, Y_train, X_test, Y_test = X[:seq], Y[:seq], X[seq:], Y[seq:]
Copy the code

Model building:

# %%
# Small feed-forward classifier: 64 -> dropout -> 16 -> 2-way softmax.
model = keras.models.Sequential()

# input_dim must equal the number of feature columns fed to fit(); the
# hard-coded 12 presumably matches the encoded Titanic features (Age, SibSp,
# Parch, Fare plus the Pclass/Sex/Embarked dummies) - verify against X_train.
model.add(keras.layers.Dense(64, input_dim=12, activation="relu"))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(16, activation="relu"))
# Two softmax outputs (one per class), paired below with the sparse
# categorical cross-entropy loss, which takes integer class labels.
model.add(keras.layers.Dense(2, activation="softmax"))

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()
Copy the code

Model training and evaluation:

# %%
# Fit for 100 epochs in mini-batches of 50, holding out 20% of the training
# rows for validation.
model.fit(
    X_train,
    Y_train,
    batch_size=50,
    epochs=100,
    validation_split=0.2,
)

# %%
# Evaluate on the held-out rows; the first two entries of the result are
# reported as loss and accuracy.
y = model.evaluate(X_test, Y_test)
print("test loss is %f, acc %f" % (y[0], y[1]))

# Persist the trained model in HDF5 format.
model.save("model_100_1.h5")
Copy the code
  • Output result:
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense_1 (Dense)              (None, 64)                832
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0
_________________________________________________________________
dense_2 (Dense)              (None, 16)                1040
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 34
=================================================================
Total params: 1906
Trainable params: 1906
Non-trainable params: 0
_________________________________________________________________
...
Epoch 96/100
534/534 [==============================] - 0s 80us/step - loss: 0.3870 - acc: 0.8277 - val_loss: 0.5083 - val_acc: 0.7612
Epoch 97/100
534/534 [==============================] - 0s 80us/step - loss: 0.3921 - acc: 0.8352 - val_loss: 0.5070 - val_acc: 0.7687
Epoch 98/100
534/534 [==============================] - 0s 82us/step - loss: 0.3940 - acc: 0.8371 - val_loss: 0.5102 - val_acc: 0.7687
Epoch 99/100
534/534 [==============================] - 0s 78us/step - loss: 0.3996 - acc: 0.8277 - val_loss: 0.5106 - val_acc: 0.7687
Epoch 100/100
534/534 [==============================] - 0s 80us/step - loss: 0.3892 - acc: 0.8352 - val_loss: 0.5082 - val_acc: 0.7612
223/223 [==============================] - 0s 63us/step
test loss is 0.389338, acc 0.829596
  • Complete code:
# %%
import tensorflow as tf
import keras
import pandas as pd
import numpy as np

# Load the training set; the path is relative to the current working directory.
data = pd.read_csv("titanic/train.csv")

# Show the first rows followed by the summary statistics of the numeric columns.
print(data.head(), data.describe(), sep="\n")
# %%
# Columns kept for modelling: the label ("Survived") first, then the features.
strs = "Survived Pclass Sex Age SibSp Parch Fare Embarked"
# split() with no argument splits on whitespace; split("") would raise
# "ValueError: empty separator".
clos = strs.split()
print(clos)
# %%
# Take an explicit copy so the assignments below modify an independent frame
# rather than a slice of `data` (avoids pandas' SettingWithCopyWarning).
x_datas = data[clos].copy()
print(x_datas.head())
# %%
# Number of missing values per column before imputation.
print(x_datas.isnull().sum())

# %%
# Impute missing values: the mean for Age, the most frequent value for Embarked.
x_datas["Age"] = x_datas["Age"].fillna(x_datas["Age"].mean())
x_datas["Embarked"] = x_datas["Embarked"].fillna(x_datas["Embarked"].mode()[0])

# One-hot encode the categorical columns. dtype=float keeps every column
# numeric: recent pandas versions otherwise emit bool dummies, and a frame
# that mixes bool and float columns may convert to an object array when it is
# handed to Keras.
x_datas = pd.get_dummies(x_datas, columns=["Pclass", "Sex", "Embarked"], dtype=float)
# Crude rescaling so Age and Fare are closer to the 0/1 range of the dummies.
x_datas["Age"] /= 100
x_datas["Fare"] /= 100

# Confirm that no missing values remain and inspect the encoded frame.
print(x_datas.isnull().sum())
print(x_datas.head())

# %%
# Ordered 75% / 25% train/test split (rows are not shuffled).
seq = int(0.75 * len(x_datas))

# Column 0 is the label (Survived); all remaining columns are features.
X, Y = x_datas.iloc[:, 1:], x_datas.iloc[:, 0]
X_train, Y_train, X_test, Y_test = X[:seq], Y[:seq], X[seq:], Y[seq:]


# %%
# Small feed-forward classifier: 64 -> dropout -> 16 -> 2-way softmax.
model = keras.models.Sequential()

# input_dim must equal the number of feature columns fed to fit(); the
# hard-coded 12 presumably matches the encoded Titanic features (Age, SibSp,
# Parch, Fare plus the Pclass/Sex/Embarked dummies) - verify against X_train.
model.add(keras.layers.Dense(64, input_dim=12, activation="relu"))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(16, activation="relu"))
# Two softmax outputs (one per class), paired below with the sparse
# categorical cross-entropy loss, which takes integer class labels.
model.add(keras.layers.Dense(2, activation="softmax"))

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

# %%
# Fit for 100 epochs in mini-batches of 50, holding out 20% of the training
# rows for validation.
model.fit(
    X_train,
    Y_train,
    batch_size=50,
    epochs=100,
    validation_split=0.2,
)

# %%
# Evaluate on the held-out rows; the first two entries of the result are
# reported as loss and accuracy.
y = model.evaluate(X_test, Y_test)
print("test loss is %f, acc %f" % (y[0], y[1]))

# Persist the trained model in HDF5 format.
model.save("model_100_1.h5")
Copy the code