// JsonParserUtil.java

package com.kapil.verbametrics.util;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Utility class for parsing JSON data using the Jackson library.
 * <p>
 * All parsing entry points are static. Input strings are first stripped of
 * invisible Unicode characters (BOM, zero-width characters) and surrounding
 * whitespace, then parsed into a list of records, where each record is a
 * {@code Map<String, Object>}.
 *
 * @author Kapil Garg
 */
@Component
public class JsonParserUtil {

    private static final Logger LOGGER = LoggerFactory.getLogger(JsonParserUtil.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /** Target type for a JSON array of objects. */
    private static final TypeReference<List<Map<String, Object>>> RECORD_LIST_TYPE = new TypeReference<>() {
    };

    /** Target type for a single JSON object. */
    private static final TypeReference<Map<String, Object>> RECORD_TYPE = new TypeReference<>() {
    };

    /** Line separator used to split candidate JSON Lines input. */
    private static final Pattern LINE_BREAK = Pattern.compile("\\r?\\n");

    /** A closing brace and an opening brace separated by at least one line break (i.e. a missing comma). */
    private static final Pattern OBJECT_BOUNDARY = Pattern.compile("\\}\\s*[\\r\\n]+\\s*\\{");

    /** A comma directly before a closing bracket. */
    private static final Pattern TRAILING_COMMA = Pattern.compile(",\\s*]");

    /** A comma directly after an opening bracket. */
    private static final Pattern LEADING_COMMA = Pattern.compile("\\[\\s*,");

    /**
     * Parse training data from a JSON string.
     * <p>
     * The following layouts are tried, in this order:
     * <ol>
     *   <li>JSON Lines: two or more non-empty lines, each a complete JSON object;</li>
     *   <li>a JSON array of objects (with a lenient repair pass if strict parsing fails);</li>
     *   <li>several JSON objects separated by line breaks (including pretty-printed ones);</li>
     *   <li>a single JSON object, returned as a one-element list.</li>
     * </ol>
     * The fields inside each record are not validated here.
     *
     * @param trainingDataJson the training data as JSON string; {@code null} is treated as empty
     * @return parsed training data, one map per record
     * @throws RuntimeException if the input is empty, is in none of the supported layouts,
     *                          or cannot be parsed; the original failure is attached as the cause
     */
    public static List<Map<String, Object>> parseTrainingData(String trainingDataJson) {
        try {
            String json = cleanJsonString(trainingDataJson);
            if (json.isEmpty()) {
                throw new IllegalArgumentException("Training data is empty");
            }
            List<Map<String, Object>> jsonLines = parseJsonLines(json);
            if (!jsonLines.isEmpty()) {
                return jsonLines;
            }
            if (json.startsWith("[") && json.endsWith("]")) {
                return parseArrayLeniently(json);
            }
            // This check must run BEFORE the single-object branch: a sequence of objects also
            // starts with '{' and ends with '}', and Jackson (by default) ignores content after
            // the first root value, which would silently drop every record but the first.
            // Raw line breaks cannot occur inside a valid JSON string, so a match here always
            // marks a boundary between two values and never rewrites string content.
            String joined = OBJECT_BOUNDARY.matcher(json).replaceAll("},{");
            if (!joined.equals(json)) {
                return OBJECT_MAPPER.readValue("[" + joined + "]", RECORD_LIST_TYPE);
            }
            if (json.startsWith("{") && json.endsWith("}")) {
                Map<String, Object> single = OBJECT_MAPPER.readValue(json, RECORD_TYPE);
                return List.of(single);
            }
            throw new IllegalArgumentException("Training data must be a JSON array, object, or JSONL (one JSON object per line)");
        } catch (Exception e) {
            LOGGER.error("Failed to parse training data JSON", e);
            throw new RuntimeException("Failed to parse training data: " + e.getMessage(), e);
        }
    }

    /**
     * Attempts to read the input as JSON Lines (one JSON object per line).
     * <p>
     * Every non-empty line must be wrapped in braces and parse as a JSON object, and there
     * must be at least two such lines. A single failing line disqualifies the whole input,
     * so the caller can fall back to the other supported layouts.
     *
     * @param json the cleaned, non-empty input
     * @return the parsed records, or an empty list if the input is not JSON Lines
     */
    private static List<Map<String, Object>> parseJsonLines(String json) {
        List<Map<String, Object>> records = new ArrayList<>();
        for (String rawLine : LINE_BREAK.split(json)) {
            String line = cleanJsonString(rawLine);
            if (line.isEmpty()) {
                continue;
            }
            if (!(line.startsWith("{") && line.endsWith("}"))) {
                return List.of();
            }
            try {
                records.add(OBJECT_MAPPER.readValue(line, RECORD_TYPE));
            } catch (IOException notAnObject) {
                // Not an error: the input is simply in another layout (e.g. a pretty-printed array).
                return List.of();
            }
        }
        return records.size() >= 2 ? records : List.of();
    }

    /**
     * Parses a JSON array of objects, repairing common formatting slips only when needed.
     * <p>
     * The input is parsed strictly first, so valid JSON is never rewritten (the repair
     * patterns can also match text inside string values). Only if strict parsing fails are
     * missing commas between objects on separate lines inserted and a comma directly before
     * {@code ]} or directly after {@code [} removed, after which parsing is retried.
     *
     * @param json the cleaned input, starting with {@code [} and ending with {@code ]}
     * @return the parsed records
     * @throws IOException if the input cannot be parsed even after the repair pass
     */
    private static List<Map<String, Object>> parseArrayLeniently(String json) throws IOException {
        try {
            return OBJECT_MAPPER.readValue(json, RECORD_LIST_TYPE);
        } catch (IOException strictFailure) {
            String withCommas = OBJECT_BOUNDARY.matcher(json).replaceAll("},{");
            String withoutTrailingComma = TRAILING_COMMA.matcher(withCommas).replaceAll("]");
            String repaired = LEADING_COMMA.matcher(withoutTrailingComma).replaceAll("[");
            if (repaired.equals(json)) {
                // Nothing to repair; report the original parse failure.
                throw strictFailure;
            }
            return OBJECT_MAPPER.readValue(repaired, RECORD_LIST_TYPE);
        }
    }

    /**
     * Cleans a JSON string by removing Unicode characters that can cause parsing issues.
     * Removes the BOM (U+FEFF) and the zero-width characters U+200B, U+200C and U+200D
     * wherever they occur in the input (including inside string values), then trims
     * leading and trailing whitespace.
     *
     * @param input the input string to clean; may be {@code null}
     * @return the cleaned and trimmed string, or an empty string if {@code input} is {@code null}
     */
    private static String cleanJsonString(String input) {
        if (input == null) {
            return "";
        }
        return input
                .replace("\uFEFF", "")
                .replace("\u200B", "")
                .replace("\u200C", "")
                .replace("\u200D", "")
                .trim();
    }

    /**
     * Parse prediction data from a JSON string.
     * Handles both single JSON object and JSON array formats; a single object is
     * returned as a one-element list.
     *
     * @param predictionDataJson the prediction data as JSON string; {@code null} is treated as empty
     * @return parsed prediction data as list
     * @throws RuntimeException if the input is empty, is neither a JSON object nor a JSON array,
     *                          or cannot be parsed; the original failure is attached as the cause
     */
    public static List<Map<String, Object>> parsePredictionData(String predictionDataJson) {
        try {
            String json = cleanJsonString(predictionDataJson);
            if (json.isEmpty()) {
                throw new IllegalArgumentException("Prediction data is empty");
            }
            if (json.startsWith("[") && json.endsWith("]")) {
                // Handle JSON array format
                return OBJECT_MAPPER.readValue(json, RECORD_LIST_TYPE);
            }
            if (json.startsWith("{") && json.endsWith("}")) {
                // Handle single JSON object format
                Map<String, Object> singleObject = OBJECT_MAPPER.readValue(json, RECORD_TYPE);
                return List.of(singleObject);
            }
            throw new IllegalArgumentException("Prediction data must be a JSON object or array");
        } catch (Exception e) {
            LOGGER.error("Failed to parse prediction data JSON", e);
            throw new RuntimeException("Failed to parse prediction data: " + e.getMessage(), e);
        }
    }

}