# Monitoring/test_extract.py

import os
import json
from pathlib import Path

def extract_names_recursive(obj, names_list, visited=None, max_list_items=1000):
    """
    Extract all 'meta.name' values from a nested JSON object.

    Despite the name (kept for backward compatibility), the traversal uses an
    explicit stack instead of Python recursion, so deeply nested structures
    cannot raise RecursionError. Names are appended in depth-first pre-order:
    a dict's own name first, then everything under its 'children' (in order),
    then everything under its 'root', 'props' and 'custom' keys (in that order).
    All other dict keys are ignored.

    A name is recorded only when obj['meta'] is a dict whose 'name' is a
    non-empty string.

    Args:
        obj: The JSON object or list to process
        names_list: List to append found names to (mutated in place)
        visited: Set of object ids already visited (so a container that is
            shared or refers back to itself is processed only once);
            a fresh set is created when omitted
        max_list_items: Maximum number of leading items followed in any list
            other than one stored directly under a 'children' key (those are
            always followed in full). Pass None to follow every item.
            Expected to be a non-negative int or None.
    """
    if visited is None:
        visited = set()

    followed_keys = ('root', 'props', 'custom')

    # Explicit stack of nodes still to visit; the top of the stack is the next node.
    stack = [obj]
    while stack:
        node = stack.pop()

        # Skip non-container types and containers that were already processed
        if not isinstance(node, (dict, list)) or id(node) in visited:
            continue
        visited.add(id(node))

        if isinstance(node, list):
            # Slicing with None keeps the whole list; pushing in reverse makes
            # the first item the next one popped, preserving document order.
            stack.extend(reversed(node[:max_list_items]))
            continue

        # node is a dict: record its own name before descending
        meta = node.get('meta')
        if isinstance(meta, dict):
            name = meta.get('name')
            if isinstance(name, str) and name:
                names_list.append(name)

        # Collect what to descend into, in the order it must be visited
        pending = []
        children = node.get('children')
        if isinstance(children, list):
            pending.extend(children)
        for key in followed_keys:
            if key in node:
                pending.append(node[key])

        stack.extend(reversed(pending))

def load_scada_names(repo_path):
    """
    Recursively find all JSON files in a repository and extract all component names.
    Names can be found in 'meta.name' fields at both the root level and in nested children.

    Files are processed in sorted path order so the returned list is
    reproducible across runs and platforms. A file that cannot be read or
    parsed is reported on stdout and skipped; it never aborts the scan.

    Args:
        repo_path (str): Path to the repository

    Returns:
        list: List of SCADA names extracted from JSON files
    """
    names = []
    repo_dir = Path(repo_path)

    # Find all JSON files recursively; sorting removes the filesystem-dependent glob order
    json_files = sorted(repo_dir.glob('**/*.json'))
    print(f"Found {len(json_files)} JSON files")

    for json_file in json_files:
        try:
            # JSON is UTF-8; do not rely on the platform's locale encoding.
            # 'utf-8-sig' also accepts files that start with a UTF-8 BOM,
            # which json.load would otherwise reject.
            with open(json_file, 'r', encoding='utf-8-sig') as f:
                data = json.load(f)

            # Store the count before extraction
            count_before = len(names)

            # Extract names from the JSON structure
            extract_names_recursive(data, names)

            # Print how many names were found in this file
            found = len(names) - count_before
            if found > 0:
                print(f"Found {found} names in {json_file}")

        except json.JSONDecodeError:
            print(f"Invalid JSON in {json_file}")
        except Exception as e:
            # Deliberately broad: this is a best-effort scan, so any other
            # per-file failure (unreadable file, bad encoding, ...) is
            # reported and the remaining files are still processed.
            print(f"Error processing {json_file}: {e}")

    return names

# Test with the specified path, or with a path given as the first command-line argument
if __name__ == "__main__":
    import sys

    # Used when no path is supplied on the command line
    default_repo_path = "project/clones/9820af836d9854563e0e495ca1541de48aefd95e"
    repo_path = sys.argv[1] if len(sys.argv) > 1 else default_repo_path

    names = load_scada_names(repo_path)
    print(f"\nTotal names found: {len(names)}")

    # Print some example names if any were found
    if names:
        print("\nExample names:")
        for name in sorted(names)[:20]:  # First 20 names alphabetically
            print(f"- {name}")