Introduction to Artificial Intelligence - Homework Assignment 02 (20pts.)¶
- NETIDs:
This assignment covers the following topics:
- Decision Trees
- Hidden Markov Models
- Bayesian Reasoning
and will consist of 4 tasks:
- Task 00: Data Collection and Setup (2 pts.)
- Task 00-1: Collect missing suspect data (2 pts.)
- Task 00-2: Load and create data (0 pts.)
- Task 01: Decision Tree (7 pts.)
- Task 01-1: Create features for data (3 pts.)
- Task 01-1-1: Engineer weapon matching features (1 pt.)
- Task 01-1-2: Write function to generate feature vectors for each night for each alias (2 pts.)
- Task 01-1-3: Generate feature vectors (0 pts.)
- Task 01-2: Fit decision tree (1 pt.)
- Task 01-3: Test decision tree (1 pt.)
- Task 01-4: Short answer questions (2 pts.)
- Task 02: Hidden Markov Model (7 pts.)
- Task 02-1: Create emission matrices (1 pt.)
- Task 02-2: Implement Viterbi algorithm (2 pts.)
- Task 02-3: Compare suspect-reported and HMM-generated paths (2 pts.)
- Task 02-4: Short answer questions (2 pts.)
- Task 03: Bayesian Reasoning (4 pts.)
- Task 03-1: Combine evidence using Bayesian principles (2 pts.)
- Task 03-2: Print final results (2 pts.)
Please complete all sections. Some questions will require written answers, while others will involve coding. Be sure to run your code cells to verify your solutions.
Story Progression¶
After getting in contact with the Theisen-Floyd Estate, the police have collected data from past costume parties the estate has thrown. It turns out there have been a lot of them. They've gathered data on the movement and activity patterns of the aliases. They've also been able to compile a list of suspects, but they're not sure which suspect matches which alias!
- Professor Theisen
- Olivia Zino
- Tom Lohman
- Sophia Noonan
- Jack Mangione
- Cesar Cervera
Thinking back to class, you realize that with all of this alias data and suspect data, you could structure this as a supervised learning problem, specifically a classification task! We can compare our suspect data against the alias data, treating the alias data as ground truth, and see if any of the suspects line up with the aliases! The police were able to get statements from 4 of the 6 suspects, but unfortunately they need your help tracking down the other two. We need to gather more evidence if we're going to figure out whose alibi doesn't hold up. You'll need to visit the two remaining suspects and convince them to give you their statements to get the rest of the needed evidence.
Task 00: Collecting Suspect Statements and Data Setup (2 pts.)¶
Task 00-1: Description (2 pts.)¶
Collect Missing Suspect Data¶
Visit the two remaining suspects to collect their statements. Only one team member needs to visit each suspect, so feel free to divvy up the work. The available times to pick up the suspects' (TAs') statements are listed below:
- Jack Mangione: 5:00 - 7:00 on Monday and Wednesday in the CSE Commons
- Olivia Zino: 2:00 - 4:00 Sunday and 2:00 - 3:00 Monday and Wednesday in the Inno Lounge
Once you've collected them, add them to the suspect data json files.
- Hint: I wonder if an LLM like ChatGPT could turn a picture of the statements into the json files you need?
Task 00-2: Description (0 pts.)¶
Load and create the necessary data¶
- Load alias statements into a dictionary called "alias_data" with the format:
    {
        "alias_name": [
            {"suspect_statement": [...]}
        ]
    }
- Add the suspect statements to the suspect data json files.
- Load the suspect data into a dictionary called "suspect_data" with the format:
    {
        "suspect_name":
            {"suspect_statement": [...], "gps_data": [...], "witness_statements": {...}}
    }
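For concreteness, here is a minimal sketch of what the two dictionaries might look like once loaded. All values below are made-up placeholders, and the "time"/"room" keys inside the statement entries are an assumption; check your own json files:

    alias_data = {
        "Mr. Green": [  # one entry per past party
            {"name": "Mr. Green", "suspect_statement": [{"time": "20:00", "room": "Study"}]},
        ],
    }
    suspect_data = {
        "Cesar Cervera": {
            "name": "Cesar Cervera",
            "suspect_statement": [{"time": "20:00", "room": "Kitchen"}],
            "gps_data": [],            # filled from the json files
            "witness_statements": {},  # filled from the json files
        },
    }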
Task 00-2: Code (0 pts.)¶
import os
import json
import pprint
import numpy as np
from typing import List, Dict, Any
from collections import defaultdict

try:
    import google.colab

    REPO_URL = "https://github.com/nd-cse-30124-fa25/cse-30124-homeworks.git"
    REPO_NAME = "cse-30124-homeworks"
    HW_FOLDER = "homework02"

    # Clone repo if not already present
    if not os.path.exists(REPO_NAME):
        !git clone {REPO_URL}

    # cd into the homework folder
    %cd {REPO_NAME}/{HW_FOLDER}
except ImportError:
    pass

# Load in Alias Statement Data (all you have to do is define the path to your alias training statements)
alias_data_dir = 'alias_statements'  # Path to alias statement data
alias_data = defaultdict(list)  # Dictionary to store alias statements, format: {alias: [list of entries per alias]}

for filename in os.listdir(alias_data_dir):  # For all files in the alias statement directory:
    if filename.endswith(".json"):  # If file is .json format, then:
        file_path = os.path.join(alias_data_dir, filename)  # Build the full file path string
        with open(file_path, 'r') as f:  # Open file at file_path:
            data = json.load(f)  # Load json content into a dict (hint: json.load() is very useful here)
            alias_data[data['name']].append(data)  # Store data in alias_data dict

# Load suspect statement data (all you have to do is define the path to your suspect statement data)
suspect_data_dir = 'suspect_statements'  # Path to suspect statement data
suspect_data = {}  # Dictionary to store suspect statement data, format: {suspect name: {json data}}

for filename in os.listdir(suspect_data_dir):  # For all files in the suspect data directory:
    if filename.endswith(".json"):  # If file is .json format, then:
        with open(os.path.join(suspect_data_dir, filename), 'r') as f:  # Open file at file_path
            data = json.load(f)  # Load json file content into a dict
            suspect_name = data['name']  # Get suspect's name
            suspect_data[suspect_name] = data  # Store suspect's data in suspect_data dict

# Print Resulting Data
print("ALIAS DATA")
pprint.pprint(alias_data)
print("SUSPECT DATA")
pprint.pprint(suspect_data)
Story Progression¶
Looking through the data you have, you realize there are a lot of repeated, overlapping activities, which reminds you of categorical data. You wonder if you could set up a decision tree with the classes being the aliases and then try to match the suspects' statements against them!
Task 01: Decision Tree (7 pts.)¶
Task 01-1: Description (0 pts.)¶
Create Features for Data¶
Train a decision tree classifier to match suspects to the aliases. You'll have to do some feature engineering to get the categorical data we have into a format that can be used by the decision tree.
Task 01-1-1: Define Activity-to-Weapon Mapping
- Some of the activities seem related to the weapons found at the party; I wonder if we can use that to improve our decision tree?
Task 01-1-2: Feature Engineering / Extraction Function
- We need to turn all of the data we were given into a format that can be used by the decision tree
- Use activities that suggest a weapon that matches each alias
Task 01-1-3: Prepare Data
- Run the feature extraction function on your alias and suspect data
Task 01-1-1: Code (1 pt.)¶
# TODO: Some of the activities seem related to the weapons found at the party, I wonder if we can use that to improve our decision tree.
# TODO: Define Activity-to-Weapon Mapping
# Note: There should be 3 activities per weapon
activity_weapon_mapping = {
    'Inflating balloons': 'Gas',
    'Ripping wippets': 'Gas',
    # TODO: Add a third Gas activity
    'Whittling a stick': 'Knife',
    'Peeling an orange': 'Knife',
    'Opening letters': 'Knife',
    'Tampering with drinks': 'Poison',
    'Rinsing a glass carefully': 'Poison',
    # TODO: Add a third Poison activity
    'Tying decorative knots': 'Rope',
    'Raising a painting': 'Rope',
    'Taking her dog for a walk': 'Rope',
    'Inspecting a family heirloom': 'Firearm',
    'Opening a safe': 'Firearm',
    'Talking about skeet shooting': 'Firearm',
    'Bragging about their Birkin': 'Bag',
    'Complaining about TSA going through their purse': 'Bag',
    # TODO: Add a third Bag activity
}
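As a quick sanity check on the mapping: a dictionary lookup with .get returns None for activities that do not imply a weapon ('Admiring the wallpaper' below is a made-up activity, used only for illustration):

    print(activity_weapon_mapping.get('Peeling an orange'))       # -> 'Knife'
    print(activity_weapon_mapping.get('Admiring the wallpaper'))  # -> None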
Task 01-1-2: Code (2 pts.)¶
# TODO: finish feature extraction function
def extract_features(data, is_alias=True):
    '''
    Function converts the categorical data we have into numerical data for the decision tree
    '''
    rooms = ['Study', 'Dining Room', 'Kitchen', 'Pantry', 'Living Room', 'Bathroom']  # List of possible rooms where a suspect might have been
    weapons = ['Poison', 'Knife', 'Gas', 'Rope', 'Bag', 'Firearm']  # List of possible weapons

    features = {}  # Dictionary to store extracted/calculated features
    room_time = {room: 0 for room in rooms}  # Dictionary to track time in each room, start at 0
    activities = []  # List to store activities done by suspect
    total_time = 0  # Keep track of total observations suspect has

    # TODO: Count the number of times the suspect or alias visits each room and build a list of activities they do

    # TODO: Calculate the percentage of time the suspect or alias spends in each room

    # TODO: Create features indicating if an activity they've done suggests a weapon

    return features
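If you are unsure what the finished feature vector should look like, here is a minimal sketch of one possible approach. This is not the required solution: it assumes each entry in data['suspect_statement'] has 'room' and 'activity' keys (verify against your own json files), and the time_in_* / suggests_* key names simply match the export_text hint in Task 01-2:

    def extract_features_sketch(data):
        rooms = ['Study', 'Dining Room', 'Kitchen', 'Pantry', 'Living Room', 'Bathroom']
        weapons = ['Poison', 'Knife', 'Gas', 'Rope', 'Bag', 'Firearm']

        entries = data['suspect_statement']
        room_time = {room: 0 for room in rooms}
        suggested = set()
        for entry in entries:
            room_time[entry['room']] += 1  # one observation = one unit of time
            weapon = activity_weapon_mapping.get(entry['activity'])
            if weapon:  # activity hints at a weapon
                suggested.add(weapon)

        features = {f'time_in_{room}': room_time[room] / len(entries) for room in rooms}
        features.update({f'suggests_{weapon}': int(weapon in suggested) for weapon in weapons})
        return features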
Task 01-1-3: Code (0 pts.)¶
import pandas as pd

alias_features_dict = {}  # Make empty dict to store extracted features for each alias
for alias_name, alias_samples in alias_data.items():  # Loop through each alias in alias_data
    alias_features_dict[alias_name] = []  # Make empty list for each alias to store extracted features
    for alias_sample in alias_samples:  # Loop through alias statements belonging to the same person
        features = extract_features(alias_sample, is_alias=True)  # Call extract_features function
        alias_features_dict[alias_name].append(features)  # Append extracted features dictionary

suspect_features_dict = {}  # Empty dict to store extracted features for each suspect
for suspect, data in suspect_data.items():  # Loop through each suspect in suspect data
    features = extract_features(data, is_alias=False)  # Call extract_features function
    suspect_features_dict[suspect] = features  # Store extracted features in suspect_features_dict
Task 01-2: Code (1 pt.)¶
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

# TODO: Define training and testing data (alias features)

# TODO: Train Decision Tree Classifier
# use arguments criterion='entropy', random_state=42, max_depth=4, min_samples_split=4

# EXTRA: Print the decision tree structure
# tree_rules = export_text(clf, feature_names=[f'time_in_{room}' for room in rooms] + [f'suggests_{weapon}' for weapon in weapons])
# print(tree_rules)
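If you get stuck setting up the training data, one common pattern is to flatten the per-alias feature dicts into parallel X/y lists and fit with the hyperparameters given above. This is a sketch, not the only valid layout; the variable names are just suggestions:

    X_train, y_train = [], []
    for alias_name, feature_dicts in alias_features_dict.items():
        for feat in feature_dicts:
            X_train.append(list(feat.values()))  # feature dicts share a key order
            y_train.append(alias_name)           # the alias is the class label

    clf = DecisionTreeClassifier(criterion='entropy', random_state=42,
                                 max_depth=4, min_samples_split=4)
    clf.fit(X_train, y_train)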
Task 01-3: Description (0 pts.)¶
Use Decision Tree to Classify Suspects as Aliases¶
Test decision tree classifier and print results
- Note: All code for printing results is provided to make output format consistent among students
Store decision tree results to be used later in Bayesian reasoning task
- Note: This step is done for the students to make sure data formats align between tasks to later be used in Bayesian reasoning
Task 01-3: Code (0 pts.)¶
X_test = [list(x.values()) for x in suspect_features_dict.values()]
probabilities = clf.predict_proba(X_test)  # returns a 2D matrix of probabilities
# each row is a suspect, each col is the probability that the suspect matches one of the aliases

# Print Results
print("\nProbability of Each Suspect Matching Each Alias:\n")
print(f"{clf.classes_}")
for suspect_name, probs in zip(suspect_features_dict.keys(), probabilities):
    formatted_probs = [f"{v:.4f}" for v in probs]
    print(f'{suspect_name}: {formatted_probs}')

# Store decision tree results to be used later in Bayesian reasoning (Task 03)
aliases = list(clf.classes_)
decision_tree_results = {s_n: p for s_n, p in zip(suspect_features_dict.keys(), probabilities)}
# Notes:
# dict contains suspect names as keys and their corresponding probability distributions as values
# i.e. each suspect has a list of probs (one prob per alias) for how likely it is that said suspect matches said alias
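# e.g. decision_tree_results['Tom Lohman'][aliases.index('Mrs. Peacock')] is the
# probability that Tom Lohman matches the alias Mrs. Peacock (names taken from
# the expected output below)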
Task 01-3: Expected Output (1 pt.)¶
Probability of Each Suspect Matching Each Alias:
['Colonel Mustard' 'Miss Scarlet' 'Mr. Green' 'Mrs. Peacock' 'Mrs. White' 'Professor Plum']
Tom Lohman: ['0.5000', '0.0000', '0.0000', '0.5000', '0.0000', '0.0000']
Jack Mangione: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000', '1.0000']
Sophia Noonan: ['0.0000', '0.9412', '0.0000', '0.0000', '0.0000', '0.0588']
Olivia Zino: ['0.0000', '0.2000', '0.0000', '0.0000', '0.8000', '0.0000']
Professor Theisen: ['0.0000', '0.0000', '0.0000', '1.0000', '0.0000', '0.0000']
Cesar Cervera: ['0.0000', '0.0000', '0.8636', '0.0000', '0.0455', '0.0909']
Task 01-4: Short Answer Questions (2 pts.)¶
Task 01-4-1: Why are we using criterion='entropy'? Explain what “entropy” means in this context and what the tree is maximizing at each split.
- Answer:
Task 01-4-2: Do we need to scale or normalize our features for a decision tree? Why or why not?
- Answer:
Story Progression¶
Nice! We've got a decision tree that can classify the suspects into the aliases! However, let's try to get some more evidence before we make a final decision. The decision tree makes use of the time spent in rooms, but it doesn't consider the paths the suspects take between the rooms. Suddenly inspiration strikes! You recall learning about Hidden Markov Models in class and think that maybe you could use them to get some more evidence!
Task 02: Hidden Markov Models (7 pts.)¶
The police have acquired a blueprint of the estate and, based on visitor data, the likelihood that a person moves between rooms.

Note: I am not an artist, don't laugh!
Task 02-1: Code (1 pt.)¶
import numpy as np
from typing import List, Dict, Tuple
# Initialize blueprint
blueprint = {
    "rooms": {
        "Study": { "id": 0, "adjacent_to": ["Study", "Dining Room", "Living Room"] },
        "Dining Room": { "id": 1, "adjacent_to": ["Study", "Dining Room", "Kitchen", "Living Room"] },
        "Kitchen": { "id": 2, "adjacent_to": ["Dining Room", "Kitchen", "Pantry"] },
        "Pantry": { "id": 3, "adjacent_to": ["Kitchen", "Pantry"] },
        "Living Room": { "id": 4, "adjacent_to": ["Study", "Dining Room", "Living Room", "Bathroom"] },
        "Bathroom": { "id": 5, "adjacent_to": ["Living Room", "Bathroom"] }
    },
    "transition_probabilities": {
        0: { 0: 0.4, 1: 0.3, 4: 0.3 },
        1: { 0: 0.25, 1: 0.25, 2: 0.25, 4: 0.25 },
        2: { 1: 0.4, 2: 0.4, 3: 0.2 },
        3: { 2: 0.8, 3: 0.2 },
        4: { 0: 0.25, 1: 0.25, 4: 0.25, 5: 0.25 },
        5: { 4: 0.7, 5: 0.3 }
    }
}
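# A quick sanity check on how to read the blueprint (room ids map names to
# matrix indices, and each row of transition_probabilities sums to 1):
#   blueprint['transition_probabilities'][0][4]          -> 0.3 (Study -> Living Room)
#   blueprint['transition_probabilities'][0].get(3, 0.0) -> 0.0 (Study -> Pantry: not adjacent)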
locations = blueprint['rooms']
n_states = len(locations)
initial = np.ones(n_states) / n_states
def weight_room_matrix(time: str, gps_data: List[Dict], witness_statements: Dict[str, Dict]) -> np.ndarray:
    """
    Create emission probability matrix using objective evidence (GPS and witnesses).
    """
    # Start with uniform distribution
    emission = np.ones(n_states) / n_states

    # TODO: Weight based on GPS evidence
    # Conceptually:
    #   Check if there is GPS data for the specified time
    #   If yes, then:
    #     get the location and signal strength
    #     reduce the prob of all rooms based on the inverse of signal strength
    #     increase the prob of the room matching the GPS data based on signal strength

    # TODO: Weight based on witness evidence
    # Conceptually:
    #   Check the witness statements for the given time
    #   Retrieve the reliability of the witness, adjust the emission matrix probs
    #   Probs are adjusted by:
    #     Reduce the likelihood of all rooms by (1 - reliability) and
    #     Increase the prob of the room where the witness saw the suspect based on the reliability measure

    # Normalize
    emission /= np.sum(emission)
    return emission
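# An illustrative sketch of the GPS weighting idea described above. The field
# names 'time', 'room', and 'signal_strength' are assumptions, check your own
# json files before using them:
#
#   for reading in gps_data:
#       if reading['time'] == time:
#           strength = reading['signal_strength']       # assumed to be in [0, 1]
#           emission *= (1 - strength)                  # damp every room
#           room_id = locations[reading['room']]['id']
#           emission[room_id] += strength               # boost the GPS room
#
# Witness statements follow the same pattern with a reliability score in place
# of signal strength, and the final emission vector is renormalized to sum to 1.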
Task 02-2: Code (2 pts.)¶
def viterbi(suspect_data: Dict, timestamps: List[str]) -> Tuple[List[str], float]:
    """
    Use the Viterbi algorithm to find the most likely sequence of true locations.
    Also returns the probability of the sequence.
    """
    # -------------------------------------
    # Step 3a: Get ready for forward pass
    #   1. Set up total_timesteps and num_rooms
    #   2. Initialize V and backptr matrices
    #   3. Compute the emission probability matrices
    #   4. Initialize the first timestep
    # -------------------------------------

    # TODO: Initialize total_timesteps, num_rooms, and matrices

    # Create emission matrices for all timestamps
    emissions = []
    for t in timestamps:
        emission_t = weight_room_matrix(t, suspect_data['gps_data'], suspect_data['witness_statements'])
        emissions.append(emission_t)

    # TODO: Initialize first timestep
    # Note: first time step (time 0) probs are initialized using the initial state distribution and the emission matrix for the first time step

    # -------------------------------------
    # Step 3b: Do forward pass
    #   1. Iterate through each time step
    #   2. For each room, consider all possible previous rooms (adjacent rooms)
    #   3. Compute the prob of arriving in the room from any adjacent room
    #   4. Store the best previous state in backptr (to use for backtracking later)
    #   5. Update the Viterbi matrix (V) with the highest probability
    # -------------------------------------

    # -------------------------------------
    # Step 3c: Backtrack to reconstruct path
    #   1. Find the most likely ending room
    #   2. Backtrack through the backptr matrix to reconstruct the path
    #   3. Reverse the reconstructed path to correct order
    #   4. Convert room IDs to room names
    #   5. Compute the prob of the sequence
    #   6. Return the path and its probability
    # -------------------------------------

    # TODO: Initialize the path
    # Conceptually:
    #   V[-1] refers to the last row of the Viterbi matrix (contains the log probabilities of the suspect being in each room at the last time step)
    #   np.argmax(V[-1]) finds the index of the room with the highest probability.
    #   So current gets initialized to the most likely final state and then appended to the path as the starting point

    # TODO: Reconstruct the path
    # Conceptually:
    #   traverse the backpointer matrix (backptr) in reverse to reconstruct the most likely path
    #   the loop starts from the second-to-last time step (total_timesteps - 1) and goes backward to 1
    #   backptr[time_step, current] gives the most likely previous room at time_step
    #   current is updated to this previous room and then added to the path

    # TODO: Reverse the path and convert indices to room names
    # Conceptually:
    #   Since we were backtracking from the last time step to the first, the path list is in reverse order.
    #   path.reverse() puts it in chronological order
    #   Then we need to convert the numerical room indices in the reconstructed path back to their names using the blueprint

    # TODO: Calculate sequence probability
    # Conceptually:
    #   V[-1] contains the log probs of ending in each room at the last time step
    #   np.max(V[-1]) gives the highest log prob
    #   np.exp(...) converts it back from log prob to actual prob.

    return room_path, sequence_prob
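# For reference, the log-space recurrence at the heart of Step 3b. This is a
# sketch, not required code; 'trans' stands for blueprint['transition_probabilities']:
#
#   V[t, j]       = np.log(emissions[t][j]) + max over adjacent rooms i of
#                   ( V[t-1, i] + np.log(trans[i][j]) )
#   backptr[t, j] = the previous room i that achieved that max
#
# Summing logs instead of multiplying raw probabilities avoids numerical
# underflow on long sequences (see Task 02-4-1).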
Task 02-3: Code (1 pt.)¶
# Get number of states in HMM
locations = blueprint['rooms']
n_states = len(locations)

# Set up initial state distribution for HMM
initial = np.ones(n_states) / n_states

movement_results = {}
print("\nHMM Path Analysis:")
print("=" * 50)

# Loop through each suspect in the dataset
for suspect_name, suspect in suspect_data.items():
    # Get all timestamps in chronological order (hint: sort timestamps to ensure chronological order)
    timestamps = []
    for entry in suspect['suspect_statement']:
        timestamps.append(entry['time'])
    timestamps.sort()

    # Run Viterbi to get the most likely true path using GPS data and witness statements
    true_path, path_prob = viterbi(
        {
            'gps_data': suspect['gps_data'],
            'witness_statements': suspect['witness_statements']
        },
        timestamps
    )

    # Compare the true path (from Viterbi) and the suspect's stated path
    # Count the number of matches where the suspect's statement aligns with the computed true path

    # TODO: Get the suspect's stated path

    # TODO: Calculate matches between true and stated paths
    # hint: Compare the true path (from Viterbi) and the suspect's stated path by
    # counting the number of matches where the suspect's statement aligns with the computed true path

    agreement = matches / len(true_path)

    print(f"\n{suspect_name}:")
    print(f'\tAgreement of Statement Path with HMM Path: {agreement}, Number of Contradictions: {len(true_path) - matches}')

    path_agreement = {
        'path_probability': path_prob,
        'statement_agreement': agreement,
        'num_contradictions': len(true_path) - matches
    }
    movement_results[suspect_name] = path_agreement
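If you are unsure how to rebuild the stated path, one possible sketch for the two TODOs (it belongs inside the suspect loop, and it assumes each suspect_statement entry has a 'room' key alongside 'time', which you should verify against your json files):

    # Order the statement entries by time, then count positions where the
    # stated room agrees with the Viterbi path
    stated_path = [entry['room'] for entry in
                   sorted(suspect['suspect_statement'], key=lambda e: e['time'])]
    matches = sum(1 for stated, true in zip(stated_path, true_path) if stated == true)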
Task 02-3: Expected Output (1 pt.)¶
HMM Path Analysis:
==================================================

Jack Mangione:
    Agreement of Statement Path with HMM Path: 0.6, Number of Contradictions: 8

Sophia Noonan:
    Agreement of Statement Path with HMM Path: 0.75, Number of Contradictions: 5

Professor Theisen:
    Agreement of Statement Path with HMM Path: 0.7, Number of Contradictions: 6

Tom Lohman:
    Agreement of Statement Path with HMM Path: 0.6, Number of Contradictions: 8

Cesar Cervera:
    Agreement of Statement Path with HMM Path: 0.45, Number of Contradictions: 11

Olivia Zino:
    Agreement of Statement Path with HMM Path: 0.7, Number of Contradictions: 6
Task 02-4: Short Answer Questions (2 pts.)¶
Task 02-4-1: Why do we use np.log around our probabilities and add them instead of multiplying them?
- Answer:
Task 02-4-2: How do we create the emission matrices?
- Answer:
Story Progression¶
Looks like we have a lot of lying suspects, but one seems to be lying more than the others. The evidence is beginning to stack up against Mr. Cervera! Now we have two pieces of evidence that we can use to match the suspects to the aliases, if only there were some way to combine them. As you finish your drink, you think back to class and remember that you can use Bayesian reasoning to combine the evidence! With the mix of your knowledge of Bayesian reasoning and your Python programming skills, hopefully we'll be able to figure out who was who!
Task 03: Final Bayesian Analysis (4 pts)¶
Task 03-1: Description (0 pts.)¶
Evidence Combination Function¶
- Define a function that combines the evidence from the decision tree and the HMM to give an output containing a mapping of the normalized posterior probability distribution over all aliases for each suspect.
Task 03-1: Code (2 pts.)¶
def combine_evidence_bayesian(aliases, decision_tree_results, hmm_results) -> Dict[str, Dict]:
    """
    Combine all evidence, assuming that whoever is Mr. Green is lying!
    P(Alias|Evidence) ∝ P(Alias|Activities) * P(Alias|Truthfulness)
    """
    final_results = {}

    # We can assume that whoever is Mr. Green is lying
    # So low statement match with highest likelihood path → higher probability of being Mr. Green

    # TODO: Iterate through each suspect and calculate posterior probabilities
    # TODO: Retrieve decision tree probabilities for the current suspect
    # TODO: Retrieve path agreement score from the HMM results
    # TODO: For each alias, combine decision tree and HMM evidence
    # TODO: Normalize posteriors

    return final_results
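# One possible reading of the combination rule, sketched in pseudocode (the
# exact weighting is up to you): treat the HMM agreement score as a
# truthfulness likelihood, inverted for Mr. Green since we assume he is lying.
#
#   for each suspect:
#       agreement = hmm_results[suspect]['statement_agreement']
#       for i, alias in enumerate(aliases):
#           likelihood = (1 - agreement) if alias == 'Mr. Green' else agreement
#           posterior[alias] = decision_tree_results[suspect][i] * likelihood
#       normalize posterior so the probabilities sum to 1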
# Call combine_evidence_bayesian to get final probabilities for each suspect being each alias
final_results = combine_evidence_bayesian(aliases, decision_tree_results, movement_results)

# Print Results
for suspect, results in final_results.items():
    print(f"{suspect}:")
    for alias, prob in results.items():
        print(f"  {alias}: {prob:.3f}")
Task 03-2: Expected Results (1 pt.)¶
Sophia Noonan:
  Colonel Mustard: 0.000
  Miss Scarlet: 0.941
  Mr. Green: 0.000
  Mrs. Peacock: 0.000
  Mrs. White: 0.000
  Professor Plum: 0.059
Tom Lohman:
  Colonel Mustard: 0.500
  Miss Scarlet: 0.000
  Mr. Green: 0.000
  Mrs. Peacock: 0.500
  Mrs. White: 0.000
  Professor Plum: 0.000
Jack Mangione:
  Colonel Mustard: 0.000
  Miss Scarlet: 0.000
  Mr. Green: 0.000
  Mrs. Peacock: 0.000
  Mrs. White: 0.000
  Professor Plum: 1.000
Professor Theisen:
  Colonel Mustard: 0.000
  Miss Scarlet: 0.000
  Mr. Green: 0.000
  Mrs. Peacock: 1.000
  Mrs. White: 0.000
  Professor Plum: 0.000
Cesar Cervera:
  Colonel Mustard: 0.000
  Miss Scarlet: 0.000
  Mr. Green: 0.886
  Mrs. Peacock: 0.000
  Mrs. White: 0.038
  Professor Plum: 0.076
Olivia Zino:
  Colonel Mustard: 0.000
  Miss Scarlet: 0.200
  Mr. Green: 0.000
  Mrs. Peacock: 0.000
  Mrs. White: 0.800
  Professor Plum: 0.000
Final Story Progression: Success!!¶
We did it! It seems extremely likely that Cesar Cervera is Mr. Green! You quickly call up Detective Caulfield and explain your findings. He laughs at you. It seems that a purely statistical analysis isn't going to be enough for the authorities to issue a warrant. You pour yourself another drink and start brainstorming other ways to find more conclusive evidence.