Decision tree based ID3 algorithm

Implement the decision-tree-based ID3 algorithm and use an appropriate data set to build the decision tree.

Program->

import pandas as pd
import numpy as np
import sys

from pprint import pprint

# Load the samples from the CSV file; column names are supplied explicitly
# through ``names`` instead of being read from the file.
dataset = pd.read_csv(
    "abhi.csv",
    names=["outlook", "temp", "humidity", "wind", "play"],
)

#====entropy====
def entropy(target_col):
    """Return the Shannon entropy, in bits, of the labels in ``target_col``.

    Args:
        target_col: One-dimensional array-like of class labels (for example
            a pandas Series or a list).

    Returns:
        The entropy ``-sum(p * log2(p))`` over the relative frequency ``p``
        of each distinct label. An empty input yields ``0.0``.
    """
    _, counts = np.unique(target_col, return_counts=True)
    if counts.size == 0:
        # No samples means no uncertainty; also avoids dividing by zero.
        return 0.0
    probabilities = counts / counts.sum()
    # Every count returned by np.unique is >= 1, so log2 never sees zero.
    return -np.sum(probabilities * np.log2(probabilities))

#====infogain====

def infoGain(data, split_attribute_name, target_name="play"):
    """Return the information gain from splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the weighted average
    of the target entropies of the subsets that share each distinct value of
    ``split_attribute_name``.

    Args:
        data: pandas DataFrame holding the samples.
        split_attribute_name: Name of the column to split on.
        target_name: Name of the class-label column.

    Returns:
        The information gain as a float.
    """
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    total = counts.sum()
    # Select each subset with a boolean mask rather than where().dropna():
    # dropna() would also discard rows that have a missing value in some
    # unrelated column, making the subset entropies disagree with the weights.
    weighted_entropy = sum(
        (count / total)
        * entropy(data.loc[data[split_attribute_name] == val, target_name])
        for val, count in zip(vals, counts)
    )
    return total_entropy - weighted_entropy

#====ID3====
def ID3(data, originaldata, features, target_attribute_name="play", parent_node_class=None):
    """Recursively build an ID3 decision tree.

    Args:
        data: pandas DataFrame with the samples for the current node.
        originaldata: The complete training DataFrame; its majority class is
            the answer for an empty partition.
        features: Sequence of column names still available for splitting.
        target_attribute_name: Name of the class-label column.
        parent_node_class: Majority class of the parent node, returned when
            no features are left to split on.

    Returns:
        Either a class label (leaf) or a nested dict of the form
        ``{feature: {value: subtree_or_label, ...}}``.
    """
    # An empty partition must be handled first: indexing the unique labels
    # of empty data with [0] would raise IndexError.
    if len(data) == 0:
        classes, counts = np.unique(originaldata[target_attribute_name], return_counts=True)
        return classes[np.argmax(counts)]

    classes, counts = np.unique(data[target_attribute_name], return_counts=True)
    if len(classes) <= 1:
        # Pure node: every sample carries the same label.
        return classes[0]
    if len(features) == 0:
        return parent_node_class

    # Majority class of this node, handed down to the children.
    parent_node_class = classes[np.argmax(counts)]

    gains = [infoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]
    tree = {best_feature: {}}
    remaining_features = [f for f in features if f != best_feature]

    for value in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == value]
        # Pass originaldata through instead of relying on a module-level
        # global, so the function works for any training set.
        tree[best_feature][value] = ID3(
            sub_data,
            originaldata,
            remaining_features,
            target_attribute_name,
            parent_node_class,
        )
    return tree
#========
def predict(query, tree, default=1):
    """Classify one sample by walking a nested-dict decision tree.

    Args:
        query: Mapping of attribute name to attribute value for the sample.
        tree: Decision tree of the form
            ``{feature: {value: subtree_or_label, ...}}``.
        default: Value returned when the tree cannot answer the query,
            either because the query's value for a tested attribute has no
            branch or because no attribute of the query appears in the tree.

    Returns:
        The label stored at the leaf that was reached, or ``default``.
    """
    for key in query:
        if key not in tree:
            continue
        try:
            result = tree[key][query[key]]
        except (KeyError, TypeError):
            # The tree has no branch for this attribute value.
            return default
        if isinstance(result, dict):
            # Propagate the caller's default into the subtree.
            return predict(query, result, default)
        return result
    return default
#==========
def train_test_split(dataset, split_index=11):
    """Return the training and testing frames derived from ``dataset``.

    Args:
        dataset: pandas DataFrame holding all samples.
        split_index: Row position at which the testing slice starts.

    Returns:
        A ``(training_data, testing_data)`` tuple, each with a fresh
        0-based index.
    """
    # NOTE(review): the training frame is the WHOLE data set, so the testing
    # rows are also training rows and the reported accuracy is measured on
    # seen data. Kept as-is to preserve existing results.
    training_data = dataset.iloc[:].reset_index(drop=True)
    testing_data = dataset.iloc[split_index:].reset_index(drop=True)
    return training_data, testing_data

# Split once and unpack both frames from the returned pair.
training_data, testing_data = train_test_split(dataset)
#==========
def test(data, tree):
    """Print the prediction accuracy of ``tree`` on ``data``.

    Every column of ``data`` except the last is used as the query
    attributes; predictions are compared with the ``"play"`` column.

    Args:
        data: pandas DataFrame of samples that contains a ``"play"`` column.
        tree: Decision tree of the form accepted by ``predict``.
    """
    queries = data.iloc[:, :-1].to_dict(orient="records")
    predictions = [predict(query, tree, 1.0) for query in queries]

    # Compare by position so the result does not depend on the index labels
    # of ``data``.
    correct = sum(
        bool(guess == actual)
        for guess, actual in zip(predictions, data["play"])
    )
    total = len(data)
    # With no rows there is no meaningful accuracy; report NaN, not a crash.
    accuracy = (correct / total) * 100 if total else float("nan")
    print('The predcition accuracy is:', accuracy, '%')

# Build the tree from every column except the last one, print it, then
# report its accuracy on the testing frame.
tree = ID3(
    data=training_data,
    originaldata=training_data,
    features=training_data.columns[:-1],
)
pprint(tree)
test(testing_data, tree)
#========================

Dataset->

sunny,hot,high,weak,no
sunny,hot,high,strong,no
overcast,hot,high,weak,yes
rain,mild,high,weak,yes
rain,cool,normal,weak,yes
rain,cool,normal,strong,no
overcast,cool,normal,strong,yes
sunny,mild,high,weak,no
sunny,cool,normal,weak,yes
rain,mild,normal,weak,yes
sunny,mild,normal,strong,yes
overcast,mild,high,strong,yes
overcast,hot,normal,weak,yes
rain,mild,high,strong,no

Output->

{'outlook': {'overcast': 'yes',
             'rain': {'wind': {'strong': 'no', 'weak': 'yes'}},
             'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}
The predcition accuracy is: 100.0 %

Search This Blog

Arya Drj

Decision tree based ID3 algorithm

Comments

Post a Comment

Popular posts from this blog

Invalid syntax , perhaps you forgot a comma? Error in Python

MAD Project (Mobile Android Application Development)

How to run PL/SQL Code With Command Line?