わいす工場 SIGNATE SIGNATEトライ -国勢調査予測2- 1st submission

SIGNATEトライ -国勢調査予測2- 1st submission


更新までにかなり時間がかかってしまいました。本来はステップを全て記述したかったところですが、都合により(忘れた)、最終的にスコア0.8429412(190/391位)となったコードを簡単な解説とともに紹介しようと思います。

#お馴染みの必要なライブラリ、ファイルの読み込み
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training and test tables from the current working directory
# (train.csv first, then test.csv).
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

#説明変数を再見直し(ver1より)

#sex->変更なし
#workclass->3段階から5段階へ分割
#education->3段階から5段階へ分割->数値で評価->10th以下は一まとめにして、それ以外は数値変換
#marital-status->2段階から3段階へ分割
#occupation->3段階から6段階へ分割->数値で評価(線形的に変化)
#relationship->変更なし
#race->データ母数の偏りが大きい。消去する
#native-country->データ母数の偏りが大きい。消去する
#education-num->ver10同様に3分割

# Collapse categorical levels into coarse groups, applying exactly the same
# replacements to the training and the test frame.
combine = [train_df, test_df]

# column -> ordered (group label, levels folded into that group).
# Levels that are not listed for a column are left unchanged.
level_groups = {
    "marital-status": (
        ("big", ["Married-civ-spouse"]),
        ("middle", ["Divorced", "Never-married", "Separated"]),
        ("small", ["Widowed"]),
    ),
    "relationship": (
        ("big", ["Husband", "Wife"]),
        ("small", ["Not-in-family", "Other-relative", "Own-child", "Unmarried"]),
    ),
}

# Disabled groupings (they were commented out in this version of the script),
# kept here for reference only:
#   occupation: biggest=Exec-managerial/Prof-specialty; big=Sales/Tech-support;
#     middle-b=Transport-moving/Protective-serv;
#     middle-s=Craft-repair/Farming-fishing/Machine-op-inspct;
#     middle-ss=Craft-repair; small=?/Adm-clerical;
#     smallest=Handlers-cleaners/Other-service
#   education: biggest=Prof-school; big=Bachelors/Masters;
#     middle=Assoc-acdm/Some-college; middle-small=Assoc-voc/HS-grad;
#     small=10th/11th/12th/1st-4th/5th-6th/7th-8th/9th
#   workclass: big=Self-emp-inc; middle=Federal-gov/Self-emp-not-inc;
#     middle-low=Local-gov/State-gov; small-low=Private; small=?

for dataset in combine:
    for column, groups in level_groups.items():
        for group_label, members in groups:
            # Assign the result back to the column so the frame itself is
            # updated (no chained in-place modification).
            dataset[column] = dataset[column].replace(members, group_label)

# Bucket age into two groups stored in a new "age_g" column:
#   "under" when age <= 35, "over" otherwise.
#
# The column is assigned directly instead of calling
# df["age_g"].where(..., inplace=True): that form modifies a temporary
# selection and, with pandas Copy-on-Write enabled, no longer writes the
# result back into the DataFrame, which would leave every row at its
# placeholder value.
#
# Rows for which `age <= 35` is false (including a missing age) fall into
# "over", which matches the result of the original two-step masking.
train_df["age_g"] = np.where(train_df["age"] <= 35, "under", "over")
test_df["age_g"] = np.where(test_df["age"] <= 35, "under", "over")

# Disabled experiment (it was commented out in this version of the script):
# a three-way split of education-num into an "edu_g" column, with
# "low" for education-num <= 8, "high" for education-num >= 13 and
# "mid" for everything in between.
    
# Remove the columns that are not used as features: age (now represented by
# age_g), race, native-country and fnlwgt. The same columns are dropped from
# both frames so that train and test keep matching layouts.
unused_columns = ["age", "race", "native-country", "fnlwgt"]
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

#fnlwgtをダミー化するため、objectに変換
#train_df["fnlwgt"]= train_df["fnlwgt"].astype('object')
#test_df["fnlwgt"]= test_df["fnlwgt"].astype('object')

#age以外をダミー化する
# train_df = pd.get_dummies(train_df)
# test_df = pd.get_dummies(test_df)

コメントを残す

メールアドレスが公開されることはありません。

CAPTCHA


Related Post