Decision tree based ID3 algorithm

This program implements the decision-tree-based ID3 algorithm and uses an appropriate data set (listed below) to build the decision tree.

Program->

import pandas as pd
import numpy as np
import sys


from pprint import pprint

# The CSV file has no header row, so the column names are supplied here;
# the last column ("play") is the class label the tree learns to predict.
dataset = pd.read_csv(
    "abhi.csv",
    names=["outlook", "temp", "humidity", "wind", "play"],
)

#====entropy====
def entropy(target_col):
    """Return the base-2 Shannon entropy of the values in ``target_col``.

    Every distinct value with relative frequency ``p`` contributes
    ``-p * log2(p)`` to the result, so a column holding a single distinct
    value has entropy 0 and an even two-way split has entropy 1.
    """
    # Only the frequencies matter; the distinct values themselves are unused.
    _, frequencies = np.unique(target_col, return_counts=True)
    probabilities = frequencies / np.sum(frequencies)
    return np.sum(-probabilities * np.log2(probabilities))

#====infogain====

def infoGain(data,split_attribute_name,target_name="play"):
    """Return the information gain of splitting ``data`` on one attribute.

    The gain is the entropy of the ``target_name`` column minus the weighted
    average entropy of that column inside each group of rows sharing a value
    of ``split_attribute_name``; each group is weighted by its share of rows.

    Parameters:
        data: DataFrame holding both the split column and the target column.
        split_attribute_name: name of the column to split on.
        target_name: name of the class-label column.
    """
    split_column = data[split_attribute_name]
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(split_column, return_counts=True)
    total_count = np.sum(counts)
    # Select each group with a boolean mask rather than where().dropna():
    # dropna() would also discard rows whose only missing value sits in an
    # unrelated column, making the group smaller than the weight assumes.
    weighted_entropy = np.sum([
        (count / total_count)
        * entropy(data.loc[split_column == val, target_name])
        for val, count in zip(vals, counts)
    ])
    return total_entropy - weighted_entropy

#====ID3====
def _majority_class(target_col):
    """Return the most frequent value in ``target_col``."""
    classes, counts = np.unique(target_col, return_counts=True)
    # np.argmax returns the first maximum, so ties go to the value that
    # comes first in np.unique's sorted output.
    return classes[np.argmax(counts)]


def ID3(data,originaldata,features,target_attribute_name="play",parent_node_class=None):
    """Build a decision tree with the ID3 algorithm.

    Parameters:
        data: DataFrame of the rows that reach the current node.
        originaldata: the full training DataFrame; its majority class is the
            answer for a node that receives no rows.
        features: sequence of column names still available for splitting.
        target_attribute_name: name of the class-label column.
        parent_node_class: majority class of the parent node, returned when
            no features are left to split on.

    Returns:
        A class label for a leaf, otherwise a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.
    """
    # The empty case has to be tested first: an empty frame has zero distinct
    # labels, so checking for a "pure" node first would index an empty array.
    if len(data) == 0:
        return _majority_class(originaldata[target_attribute_name])

    classes = np.unique(data[target_attribute_name])
    if len(classes) == 1:
        # Every remaining row has the same label: this node is a leaf.
        return classes[0]

    if len(features) == 0:
        return parent_node_class

    # Remember this node's majority label for children that run out of features.
    parent_node_class = _majority_class(data[target_attribute_name])

    gains = [infoGain(data, feature, target_attribute_name) for feature in features]
    best_feature = features[np.argmax(gains)]
    remaining_features = [f for f in features if f != best_feature]

    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        # Boolean-mask selection keeps rows intact; where().dropna() would
        # also drop rows with missing values in unrelated columns.
        sub_data = data[data[best_feature] == value]
        # Pass the caller's originaldata down instead of a module-level
        # global so the function works on any frame it is given.
        tree[best_feature][value] = ID3(
            sub_data,
            originaldata,
            remaining_features,
            target_attribute_name,
            parent_node_class,
        )
    return tree
#====predict====
def predict(query,tree,default=1):
    """Classify one record by walking the decision tree.

    Parameters:
        query: dict mapping attribute names to the record's values.
        tree: nested dict of the form ``{attribute: {value: subtree_or_label}}``.
        default: value returned when the tree cannot classify the record,
            either because an attribute value has no branch or because no
            attribute of ``query`` appears at this level of the tree.

    Returns:
        The label stored at the leaf that was reached, or ``default``.
    """
    for key, value in query.items():
        if key not in tree:
            continue
        try:
            result = tree[key][value]
        except KeyError:
            # This attribute value has no branch under the node.
            return default
        if isinstance(result, dict):
            # Carry the caller's default down so a miss deeper in the tree
            # yields the same fallback as a miss at the root.
            return predict(query, result, default)
        return result
    return default
#====train/test split====
def train_test_split(dataset, test_start=11):
    """Return ``(training_data, testing_data)`` frames taken from ``dataset``.

    Parameters:
        dataset: DataFrame to split.
        test_start: row position at which the testing frame begins; the
            testing frame runs from there to the last row.

    Returns:
        A tuple of two DataFrames, each re-indexed from 0.

    NOTE: the training frame contains every row of ``dataset``, so each
    testing row is also a training row. Accuracy measured on the testing
    frame is therefore not a held-out estimate.
    """
    training_data = dataset.reset_index(drop=True)
    testing_data = dataset.iloc[test_start:].reset_index(drop=True)
    return training_data, testing_data

# Split once and unpack both frames from the returned pair.
training_data, testing_data = train_test_split(dataset)
#====test====
def test(data,tree):
    """Print the percentage of rows in ``data`` that ``tree`` classifies correctly.

    Parameters:
        data: DataFrame whose last column is the class label; that column
            must be named ``"play"``. All other columns are the attributes
            handed to ``predict``.
        tree: decision tree in the nested-dict form consumed by ``predict``.
    """
    # One attribute dict per row, leaving out the trailing label column.
    queries = data.iloc[:, :-1].to_dict(orient="records")

    # Collect the predictions in a single pass instead of enlarging a
    # DataFrame one row at a time. dtype=object keeps the labels (and the
    # numeric 1.0 fallback) as they are rather than coercing them.
    predicted = np.array(
        [predict(query, tree, 1.0) for query in queries], dtype=object
    )

    # Compare by position so the result does not depend on the index labels
    # that ``data`` happens to carry.
    actual = data["play"].to_numpy()
    print('The predcition accuracy is:',(np.sum(predicted == actual)/len(data))*100,'%')
        
# Learn the tree from every column except the trailing class-label column,
# show it, then report its accuracy on the testing rows.
tree = ID3(
    data=training_data,
    originaldata=training_data,
    features=training_data.columns[:-1],
)
pprint(tree)
test(testing_data, tree)
#========================

Dataset->

sunny,hot,high,weak,no
sunny,hot,high,strong,no
overcast,hot,high,weak,yes
rain,mild,high,weak,yes
rain,cool,normal,weak,yes
rain,cool,normal,strong,no
overcast,cool,normal,strong,yes
sunny,mild,high,weak,no
sunny,cool,normal,weak,yes
rain,mild,normal,weak,yes
sunny,mild,normal,strong,yes
overcast,mild,high,strong,yes
overcast,hot,normal,weak,yes
rain,mild,high,strong,no

Output->    

{'outlook': {'overcast': 'yes',
             'rain': {'wind': {'strong': 'no', 'weak': 'yes'}},
             'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}
The predcition accuracy is: 100.0 %

Comments

Popular posts from this blog

How to set image in carousel using flask?

Invalid syntax , perhaps you forgot a comma? Error in Python

Cyber Security Capsule