package openai import ( "context" "encoding/json" "fmt" "strings" "github.com/food-ai/backend/internal/adapters/ai" ) // langNames maps ISO 639-1 codes to English language names used in AI prompts. var langNames = map[string]string{ "en": "English", "ru": "Russian", "es": "Spanish", "de": "German", "fr": "French", "it": "Italian", "pt": "Portuguese", "zh": "Chinese", "ja": "Japanese", "ko": "Korean", "ar": "Arabic", "hi": "Hindi", } // RecognizeReceipt uses the vision model to extract food items from a receipt photo. func (c *Client) RecognizeReceipt(ctx context.Context, imageBase64, mimeType, lang string) (*ai.ReceiptResult, error) { langName := langNames[lang] if langName == "" { langName = "English" } prompt := fmt.Sprintf(`You are an OCR system for grocery receipts. Analyse the receipt photo and extract a list of food products. For each product determine: - name: product name (remove article codes, extra symbols) - quantity: amount (number) - unit: unit (g, kg, ml, l, pcs, pack) - category: dairy | meat | produce | bakery | frozen | beverages | other - confidence: 0.0–1.0 Skip items that are not food (household chemicals, tobacco, alcohol). Items with unreadable text — add to unrecognized. Return all text fields (name) in %s. 
Return ONLY valid JSON without markdown: { "items": [ {"name": "...", "quantity": 1, "unit": "l", "category": "dairy", "confidence": 0.95} ], "unrecognized": [ {"raw_text": "...", "price": 89.0} ] }`, langName) text, err := c.generateVisionContent(ctx, prompt, imageBase64, mimeType) if err != nil { return nil, fmt.Errorf("recognize receipt: %w", err) } var result ai.ReceiptResult if err := parseJSON(text, &result); err != nil { return nil, fmt.Errorf("parse receipt result: %w", err) } if result.Items == nil { result.Items = []ai.RecognizedItem{} } if result.Unrecognized == nil { result.Unrecognized = []ai.UnrecognizedItem{} } return &result, nil } // RecognizeProducts uses the vision model to identify food items in a photo (fridge, shelf, etc.). func (c *Client) RecognizeProducts(ctx context.Context, imageBase64, mimeType, lang string) ([]ai.RecognizedItem, error) { langName := langNames[lang] if langName == "" { langName = "English" } prompt := fmt.Sprintf(`You are a food product recognition system. Look at the photo and identify all visible food products. For each product estimate: - name: product name - quantity: approximate amount (number) - unit: unit (g, kg, ml, l, pcs) - category: dairy | meat | produce | bakery | frozen | beverages | other - confidence: 0.0–1.0 Food products only. Skip empty packaging and inedible objects. Return all text fields (name) in %s. 
Return ONLY valid JSON without markdown: { "items": [ {"name": "...", "quantity": 10, "unit": "pcs", "category": "dairy", "confidence": 0.9} ] }`, langName) text, err := c.generateVisionContent(ctx, prompt, imageBase64, mimeType) if err != nil { return nil, fmt.Errorf("recognize products: %w", err) } var result struct { Items []ai.RecognizedItem `json:"items"` } if err := parseJSON(text, &result); err != nil { return nil, fmt.Errorf("parse products result: %w", err) } if result.Items == nil { return []ai.RecognizedItem{}, nil } return result.Items, nil } // RecognizeDish uses the vision model to identify a dish and estimate its nutritional content. // Returns 3–5 ranked candidates so the user can correct mis-identifications. func (c *Client) RecognizeDish(ctx context.Context, imageBase64, mimeType, lang string) (*ai.DishResult, error) { prompt := `You are a dietitian and culinary expert. Look at the dish photo and suggest 3 to 5 possible dishes it could be. Even if the first option is obvious, add 2–4 alternative dishes with lower confidence. For each candidate specify: - dish_name: dish name - weight_grams: approximate portion weight in grams (estimate from photo) - calories: calories for this portion (kcal) - protein_g, fat_g, carbs_g: macros for this portion (grams) - confidence: certainty 0.0–1.0 Sort candidates by descending confidence. First — most likely. Return dish_name values in English. 
Return ONLY valid JSON without markdown: { "candidates": [ { "dish_name": "...", "weight_grams": 350, "calories": 520, "protein_g": 22, "fat_g": 26, "carbs_g": 48, "confidence": 0.88 }, { "dish_name": "...", "weight_grams": 350, "calories": 540, "protein_g": 20, "fat_g": 28, "carbs_g": 49, "confidence": 0.65 } ] }` text, err := c.generateVisionContent(ctx, prompt, imageBase64, mimeType) if err != nil { return nil, fmt.Errorf("recognize dish: %w", err) } var result ai.DishResult if parseError := parseJSON(text, &result); parseError != nil { return nil, fmt.Errorf("parse dish result: %w", parseError) } if result.Candidates == nil { result.Candidates = []ai.DishCandidate{} } return &result, nil } // ClassifyIngredient uses the text model to classify an unknown food item // and build an ingredient_mappings record for it. func (c *Client) ClassifyIngredient(ctx context.Context, name string) (*ai.IngredientClassification, error) { prompt := fmt.Sprintf(`Classify the food product: "%s". Return ONLY valid JSON without markdown: { "canonical_name": "turkey_breast", "aliases": ["turkey breast"], "translations": [ {"lang": "ru", "name": "грудка индейки", "aliases": ["грудка индейки", "филе индейки"]} ], "category": "meat", "default_unit": "g", "calories_per_100g": 135, "protein_per_100g": 29, "fat_per_100g": 1, "carbs_per_100g": 0, "storage_days": 3 }`, name) messages := []map[string]string{ {"role": "user", "content": prompt}, } text, err := c.generateContent(ctx, messages) if err != nil { return nil, fmt.Errorf("classify ingredient: %w", err) } var result ai.IngredientClassification if err := parseJSON(text, &result); err != nil { return nil, fmt.Errorf("parse classification: %w", err) } return &result, nil } // parseJSON strips optional markdown fences and unmarshals JSON. 
// The language tag after the opening fence is matched case-insensitively, so
// "```json", "```JSON" and a bare "```" are all accepted. Input that does not
// start with a fence is only trimmed of surrounding whitespace. The result of
// json.Unmarshal into dst is returned unchanged.
func parseJSON(text string, dst any) error {
	const (
		fence = "```"
		tag   = "json"
	)

	text = strings.TrimSpace(text)
	if strings.HasPrefix(text, fence) {
		text = text[len(fence):]
		// Models do not capitalise the tag consistently; a case-sensitive
		// match would leave "JSON" in the payload and break decoding.
		if len(text) >= len(tag) && strings.EqualFold(text[:len(tag)], tag) {
			text = text[len(tag):]
		}
		text = strings.TrimSuffix(text, fence)
		text = strings.TrimSpace(text)
	}
	return json.Unmarshal([]byte(text), dst)
}