import pandas as pd
from sklearn.model_selection import train_test_split
# Column names for the header-less, tab-separated corpus file.
COLUMN_NAMES = [
    "ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP",
]
# Only articles from these publishers are kept.
TARGET_PUBLISHERS = [
    "Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail",
]
# Fixed seed so that repeated runs write identical train/valid/test files.
RANDOM_STATE = 42

# Load the corpus.
# NOTE(review): if titles contain unbalanced double quotes, pandas' default
# quoting may merge rows; consider quoting=csv.QUOTE_NONE -- confirm against
# the actual data before changing.
df = pd.read_csv("newsCorpora.csv", sep="\t", header=None, names=COLUMN_NAMES)

# Print per-column non-null counts and dtypes as a quick sanity check.
df.info()

# Keep only the articles from the target publishers, and only the columns
# that are written out (title and category label).
df = df[df["PUBLISHER"].isin(TARGET_PUBLISHERS)]
df = df[["TITLE", "CATEGORY"]]

# Split into train / valid / test. 10% of all rows go to test, then 10% of
# the remaining 90% go to valid (roughly 81% / 9% / 10% overall).
train_full, test = train_test_split(
    df, test_size=0.1, shuffle=True, random_state=RANDOM_STATE
)
train, valid = train_test_split(
    train_full, test_size=0.1, shuffle=True, random_state=RANDOM_STATE
)
splits = {"train": train, "valid": valid, "test": test}

# Save each split as a tab-separated file without index or header row.
for split_name, split_df in splits.items():
    split_df.to_csv(f"{split_name}.txt", sep="\t", index=False, header=False)

# Report the number of examples per category in each split.
print("\n【CATEGORYの事例数】")
for position, (split_name, split_df) in enumerate(splits.items()):
    if position:
        # Blank line between consecutive reports (none after the last one).
        print()
    print(f"{split_name}\n", "-" * 50, "\n", split_df["CATEGORY"].value_counts())