Monitoring/test_extract.py
2025-07-03 15:09:43 +04:00

100 lines
3.5 KiB
Python

import os
import json
from pathlib import Path
def extract_names_recursive(obj, names_list, visited=None, max_list_items=1000):
    """
    Collect every 'meta.name' string found in a nested JSON-like structure.

    The structure is walked depth-first in document order. Only these paths
    are followed from a dict: the items of its 'children' list, then the
    values stored under 'root', 'props' and 'custom'. Any other key is
    ignored. A name is recorded only when it is a non-empty string.

    The walk uses an explicit stack instead of Python recursion, so deeply
    nested structures cannot raise RecursionError.

    Args:
        obj: The JSON object (dict) or list to process. Any other type is
            ignored.
        names_list: List that found names are appended to (mutated in place).
        visited: Set of ids of containers already processed; a container is
            processed at most once. A fresh set is created when omitted.
        max_list_items: Maximum number of leading items examined in any list
            that is itself traversed (the top-level object, list items, or
            the value of 'root'/'props'/'custom'). Items beyond the cap are
            skipped. Pass None to examine every item. The items of a
            'children' list are never capped.
    """
    if visited is None:
        visited = set()

    nested_keys = ('root', 'props', 'custom')

    # Explicit LIFO stack; entries are pushed in reverse so they are popped
    # (and therefore visited) in their original order.
    stack = [obj]
    while stack:
        current = stack.pop()

        # Skip non-container values and containers that were already handled.
        if not isinstance(current, (dict, list)) or id(current) in visited:
            continue
        visited.add(id(current))

        if isinstance(current, list):
            items = current if max_list_items is None else current[:max_list_items]
            stack.extend(reversed(items))
            continue

        # Record this dict's own meta.name, if it has a usable one.
        meta = current.get('meta')
        if isinstance(meta, dict):
            name = meta.get('name')
            if isinstance(name, str) and name:
                names_list.append(name)

        # Queue children first, then the selected keys, preserving order.
        pending = []
        children = current.get('children')
        if isinstance(children, list):
            pending.extend(children)
        for key in nested_keys:
            if key in current:
                pending.append(current[key])
        stack.extend(reversed(pending))
def load_scada_names(repo_path):
    """
    Find every JSON file under a repository and extract all component names.

    Each '*.json' file below repo_path (searched recursively) is parsed and
    passed to extract_names_recursive, which collects 'meta.name' values from
    the root object and from nested children. Progress is reported with
    print(). Files that cannot be read or parsed are reported and skipped, so
    one bad file never aborts the scan.

    Args:
        repo_path (str): Path to the repository.

    Returns:
        list: SCADA names extracted from the JSON files, grouped by file in
        sorted path order.
    """
    names = []
    repo_dir = Path(repo_path)

    # Sort the matches: glob order depends on the filesystem, and a stable
    # order keeps both the log output and the returned list reproducible.
    json_files = sorted(repo_dir.glob('**/*.json'))
    print(f"Found {len(json_files)} JSON files")

    for json_file in json_files:
        try:
            # JSON is UTF-8 by specification; do not rely on the platform's
            # default encoding. 'utf-8-sig' additionally tolerates a leading
            # byte-order mark, which json.load would otherwise reject.
            with open(json_file, 'r', encoding='utf-8-sig') as f:
                data = json.load(f)

            # Remember the count so the per-file contribution can be reported.
            count_before = len(names)
            extract_names_recursive(data, names)
            found = len(names) - count_before
            if found > 0:
                print(f"Found {found} names in {json_file}")
        except json.JSONDecodeError:
            print(f"Invalid JSON in {json_file}")
        except Exception as e:
            # Deliberate best-effort boundary: report the problem (unreadable
            # file, bad encoding, ...) and move on to the next file.
            print(f"Error processing {json_file}: {e}")

    return names
# Manual test run: scans the repository given as the first command-line
# argument, or the default clone below when no argument is supplied.
if __name__ == "__main__":
    import sys

    default_repo_path = "project/clones/9820af836d9854563e0e495ca1541de48aefd95e"
    repo_path = sys.argv[1] if len(sys.argv) > 1 else default_repo_path

    names = load_scada_names(repo_path)
    print(f"\nTotal names found: {len(names)}")

    # Print some example names if any were found
    if names:
        print("\nExample names:")
        for name in sorted(names)[:20]:  # First 20 names alphabetically
            print(f"- {name}")