Files
food-ai/backend/internal/adapters/openai/recognition.go
dbastrikin 5c5ed25e5b feat: improved receipt recognition, batch product add, and scan UX
- Rewrite receipt OCR prompt: completes truncated names, preserves fat%
  and flavour attributes, extracts weight/volume from line, infers
  typical package sizes for solid goods with quantity_confidence field
- Add quantity_confidence to RecognizedItem, EnrichedItem, and
  ProductJobResultItem; propagate through item enricher and worker
- Replace per-item create loop with single POST /user-products/batch call
  from RecognitionConfirmScreen
- Rebuild RecognitionConfirmScreen: amber qty border for low
  quantity_confidence, tappable product name → catalog picker,
  sort items by confidence, full L10n (no hardcoded strings)
- Add timestamps (HH:mm / d MMM HH:mm) to recent scan chips
- Show close-app hint on ProductJobWatchScreen (queued + processing)
- Refresh recentProductJobsProvider on watch screen init so new job
  appears without a manual pull-to-refresh
- App-level WidgetsBindingObserver refreshes product and dish job lists
  on resume, fixing stale lists after background/foreground transitions
- Add 9 new L10n keys across all 12 locales

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-26 23:09:57 +02:00

233 lines
7.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package openai
import (
"context"
"encoding/json"
"fmt"
"strings"
"github.com/food-ai/backend/internal/adapters/ai"
)
// langNames translates an ISO 639-1 language code into the English name of
// that language. The name is what gets interpolated into the AI prompts in
// this file; codes that are absent from the map yield the empty string.
var langNames = map[string]string{
	"ar": "Arabic",
	"de": "German",
	"en": "English",
	"es": "Spanish",
	"fr": "French",
	"hi": "Hindi",
	"it": "Italian",
	"ja": "Japanese",
	"ko": "Korean",
	"pt": "Portuguese",
	"ru": "Russian",
	"zh": "Chinese",
}
// RecognizeReceipt sends a receipt photo to the vision model and parses the
// food items it reports.
//
// imageBase64 and mimeType are handed unchanged to generateVisionContent.
// lang is looked up in langNames to pick the language the model is asked to
// use for product names; a code that is not in langNames falls back to
// English.
//
// On success the returned result always carries non-nil Items and
// Unrecognized slices. An error is returned when the model call fails or when
// its reply cannot be decoded as JSON.
func (c *Client) RecognizeReceipt(ctx context.Context, imageBase64, mimeType, lang string) (*ai.ReceiptResult, error) {
	langName, known := langNames[lang]
	if !known || langName == "" {
		langName = "English"
	}

	// The prompt goes through fmt.Sprintf, so literal percent signs inside it
	// are written as "%%"; the only real verb is the trailing %s for langName.
	prompt := fmt.Sprintf(`You are an OCR system for grocery receipts.
Analyse the receipt photo and extract a list of food products.
Rules for each product:
NAME (confidence):
- Remove article codes, cashier codes (e.g. "1/72", "4607001234"), extra symbols.
- Complete obviously truncated OCR names: "Паштет шпро." → "Паштет шпротный",
"Паштет с говяжьей пече" → "Паштет с говяжьей печенью".
- Preserve meaningful product attributes: fat percentage ("3.2%%", "жирн. 9%%"),
flavour ("с гусиной печенью", "яблочный"), brand qualifiers ("ультрапастеризованное").
- confidence: your certainty that the name is correct (0.0-1.0).
QUANTITY + UNIT (quantity_confidence):
- If a weight or volume is written on the receipt line (e.g. "160г", "1л", "500 мл", "0.5кг"),
use it as quantity+unit. quantity_confidence = 0.9-1.0.
- If the count on the receipt is 1 and no weight/volume is stated, but the product is a
liquid (juice, milk, kefir, etc.) — infer 1 l and set quantity_confidence = 0.5.
- If the count is 1 and no weight is stated, but the product is a solid packaged good
(pâté, spreadable cheese, sausage, butter, hard cheese, etc.) — infer a typical
package weight in grams (e.g. pâté 100 g, spreadable cheese 180 g, butter 200 g)
and set quantity_confidence = 0.35.
- If the receipt explicitly states the quantity and unit (e.g. "2 кг", "3 шт"),
use them directly. quantity_confidence = 1.0.
- Never output quantity = 1 with unit = "g" unless the receipt explicitly says "1 г".
- unit must be one of: g, kg, ml, l, pcs, pack.
CATEGORY: dairy | meat | produce | bakery | frozen | beverages | other
Skip items that are not food (household chemicals, tobacco, alcohol, bags, services).
Items with unreadable text — add to unrecognized.
Return all text fields (name) in %s.
Return ONLY valid JSON without markdown:
{
"items": [
{"name": "...", "quantity": 160, "unit": "g", "category": "other", "confidence": 0.95, "quantity_confidence": 0.9}
],
"unrecognized": [
{"raw_text": "...", "price": 89.0}
]
}`, langName)

	text, err := c.generateVisionContent(ctx, prompt, imageBase64, mimeType)
	if err != nil {
		return nil, fmt.Errorf("recognize receipt: %w", err)
	}

	var result ai.ReceiptResult
	if err := parseJSON(text, &result); err != nil {
		return nil, fmt.Errorf("parse receipt result: %w", err)
	}

	// Replace nil slices with empty ones so callers never see nil lists
	// (presumably so the result encodes as [] rather than null — confirm
	// against the API consumers).
	if result.Items == nil {
		result.Items = []ai.RecognizedItem{}
	}
	if result.Unrecognized == nil {
		result.Unrecognized = []ai.UnrecognizedItem{}
	}
	return &result, nil
}
// RecognizeProducts sends a photo (fridge, shelf, etc.) to the vision model
// and returns the food items it identifies.
//
// imageBase64 and mimeType are handed unchanged to generateVisionContent.
// lang is looked up in langNames to pick the language the model is asked to
// use for product names; a code that is not in langNames falls back to
// English.
//
// The returned slice is never nil on success: a reply without items yields an
// empty slice. An error is returned when the model call fails or when its
// reply cannot be decoded as JSON.
func (c *Client) RecognizeProducts(ctx context.Context, imageBase64, mimeType, lang string) ([]ai.RecognizedItem, error) {
	langName, known := langNames[lang]
	if !known || langName == "" {
		langName = "English"
	}

	prompt := fmt.Sprintf(`You are a food product recognition system.
Look at the photo and identify all visible food products.
For each product estimate:
- name: product name
- quantity: approximate amount (number)
- unit: unit (g, kg, ml, l, pcs)
- category: dairy | meat | produce | bakery | frozen | beverages | other
- confidence: 0.0-1.0
Food products only. Skip empty packaging and inedible objects.
Return all text fields (name) in %s.
Return ONLY valid JSON without markdown:
{
"items": [
{"name": "...", "quantity": 10, "unit": "pcs", "category": "dairy", "confidence": 0.9}
]
}`, langName)

	text, err := c.generateVisionContent(ctx, prompt, imageBase64, mimeType)
	if err != nil {
		return nil, fmt.Errorf("recognize products: %w", err)
	}

	// The model wraps the list in an object, so decode through an envelope
	// and hand back only the items.
	var result struct {
		Items []ai.RecognizedItem `json:"items"`
	}
	if err := parseJSON(text, &result); err != nil {
		return nil, fmt.Errorf("parse products result: %w", err)
	}
	if result.Items == nil {
		return []ai.RecognizedItem{}, nil
	}
	return result.Items, nil
}
// RecognizeDish sends a dish photo to the vision model and asks it to identify
// the dish and estimate the nutritional content of the portion.
//
// The prompt requests 3-5 candidates ranked by descending confidence so the
// user can correct a mis-identification. imageBase64 and mimeType are handed
// unchanged to generateVisionContent. lang is accepted but not used by this
// function: the prompt asks for dish names in English.
//
// On success the returned result always carries a non-nil Candidates slice.
// An error is returned when the model call fails or when its reply cannot be
// decoded as JSON.
func (c *Client) RecognizeDish(ctx context.Context, imageBase64, mimeType, lang string) (*ai.DishResult, error) {
	prompt := `You are a dietitian and culinary expert.
Look at the dish photo and suggest 3 to 5 possible dishes it could be.
Even if the first option is obvious, add 2-4 alternative dishes with lower confidence.
For each candidate specify:
- dish_name: dish name
- weight_grams: approximate portion weight in grams (estimate from photo)
- calories: calories for this portion (kcal)
- protein_g, fat_g, carbs_g: macros for this portion (grams)
- confidence: certainty 0.0-1.0
Sort candidates by descending confidence. First — most likely.
Return dish_name values in English.
Return ONLY valid JSON without markdown:
{
"candidates": [
{
"dish_name": "...",
"weight_grams": 350,
"calories": 520,
"protein_g": 22,
"fat_g": 26,
"carbs_g": 48,
"confidence": 0.88
},
{
"dish_name": "...",
"weight_grams": 350,
"calories": 540,
"protein_g": 20,
"fat_g": 28,
"carbs_g": 49,
"confidence": 0.65
}
]
}`

	text, err := c.generateVisionContent(ctx, prompt, imageBase64, mimeType)
	if err != nil {
		return nil, fmt.Errorf("recognize dish: %w", err)
	}

	var result ai.DishResult
	if err := parseJSON(text, &result); err != nil {
		return nil, fmt.Errorf("parse dish result: %w", err)
	}

	// Replace a nil candidate list with an empty one so callers never see nil.
	if result.Candidates == nil {
		result.Candidates = []ai.DishCandidate{}
	}
	return &result, nil
}
// ClassifyIngredient asks the text model to classify a food item by name and
// decodes the reply into an ai.IngredientClassification (canonical name,
// aliases, translations, category, default unit, per-100g nutrition and
// storage days, as illustrated by the JSON example in the prompt).
//
// name is embedded in the prompt as a quoted string. An error is returned
// when the model call fails or when its reply cannot be decoded as JSON.
func (c *Client) ClassifyIngredient(ctx context.Context, name string) (*ai.IngredientClassification, error) {
	// %q escapes quotes, backslashes and control characters in name, so an
	// unusual product name cannot terminate the quoted span early and inject
	// extra instructions. Ordinary names render exactly as with "%s" in quotes.
	prompt := fmt.Sprintf(`Classify the food product: %q.
Return ONLY valid JSON without markdown:
{
"canonical_name": "turkey_breast",
"aliases": ["turkey breast"],
"translations": [
{"lang": "ru", "name": "грудка индейки", "aliases": ["грудка индейки", "филе индейки"]}
],
"category": "meat",
"default_unit": "g",
"calories_per_100g": 135,
"protein_per_100g": 29,
"fat_per_100g": 1,
"carbs_per_100g": 0,
"storage_days": 3
}`, name)

	messages := []map[string]string{
		{"role": "user", "content": prompt},
	}
	text, err := c.generateContent(ctx, messages)
	if err != nil {
		return nil, fmt.Errorf("classify ingredient: %w", err)
	}

	var result ai.IngredientClassification
	if err := parseJSON(text, &result); err != nil {
		return nil, fmt.Errorf("parse classification: %w", err)
	}
	return &result, nil
}
// parseJSON decodes a model reply into dst, tolerating an optional Markdown
// code fence around the JSON payload.
//
// Surrounding whitespace is always trimmed. When the text starts with "```":
//   - the rest of the opening fence line is dropped when it is only an info
//     string ("json", "JSON", "json5", or empty); a "json" tag glued directly
//     to the payload on the same line is removed case-insensitively;
//   - everything from the closing fence onwards is discarded, so prose the
//     model appends after the fence does not break decoding.
//
// Unfenced text is passed to json.Unmarshal as is. The returned error is the
// json.Unmarshal error, if any.
func parseJSON(text string, dst any) error {
	text = strings.TrimSpace(text)
	if strings.HasPrefix(text, "```") {
		body := text[len("```"):]

		// Opening fence: prefer dropping the whole info-string line; fall back
		// to stripping a bare "json" tag that shares a line with the payload.
		if nl := strings.IndexByte(body, '\n'); nl >= 0 && isFenceInfo(strings.TrimSpace(body[:nl])) {
			body = body[nl+1:]
		} else if len(body) >= len("json") && strings.EqualFold(body[:len("json")], "json") {
			body = body[len("json"):]
		}

		// Closing fence: a fence at the start of a line can never be part of
		// valid JSON (raw newlines are illegal inside JSON strings), so it is
		// safe to cut there even when more text follows. Otherwise only a
		// fence that ends the text is removed.
		if end := strings.Index(body, "\n```"); end >= 0 {
			body = body[:end]
		} else {
			body = strings.TrimSuffix(body, "```")
		}
		text = strings.TrimSpace(body)
	}
	return json.Unmarshal([]byte(text), dst)
}

// isFenceInfo reports whether s looks like a code-fence info string (language
// tag) and not a JSON payload that happens to sit on the fence line.
func isFenceInfo(s string) bool {
	for i := 0; i < len(s); i++ {
		ch := s[i]
		switch {
		case 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z', '0' <= ch && ch <= '9':
		case ch == '+', ch == '-', ch == '_', ch == '.', ch == '#':
		default:
			return false
		}
	}
	// Scalars such as true, null or 42 consist of tag characters too; they are
	// payload, not a language tag.
	return !json.Valid([]byte(s))
}