feat: implement Iteration 3 — product/receipt/dish recognition
Backend:
- gemini/client.go: refactor to shared callGroq transport; add generateVisionContent using llama-3.2-11b-vision-preview model
- gemini/recognition.go: RecognizeReceipt, RecognizeProducts, RecognizeDish (vision), ClassifyIngredient (text); shared parseJSON helper
- ingredient/repository.go: add FuzzyMatch (wraps Search, returns best hit)
- recognition/handler.go: POST /ai/recognize-receipt, /ai/recognize-products, /ai/recognize-dish; enrichItems with fuzzy match + AI classify fallback; parallel multi-image processing with deduplication
- server.go + main.go: wire recognition handler under /ai routes

Flutter:
- pubspec.yaml: add image_picker ^1.1.0
- AndroidManifest.xml: add CAMERA and READ_EXTERNAL_STORAGE permissions
- Info.plist: add NSCameraUsageDescription and NSPhotoLibraryUsageDescription
- recognition_service.dart: RecognitionService wrapping /ai/* endpoints; RecognizedItem, ReceiptResult, DishResult models
- scan_screen.dart: mode selector (receipt / products / dish / manual); image source picker; loading overlay; navigates to confirm or dish screen
- recognition_confirm_screen.dart: editable list of recognized items; inline qty/unit editing; swipe-to-delete; batch-add to pantry
- dish_result_screen.dart: dish name, KBZHU breakdown, similar dishes chips
- app_router.dart: /scan, /scan/confirm, /scan/dish routes (no bottom nav)
- products_screen.dart: FAB now shows bottom sheet with Manual / Scan options

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
221
backend/internal/gemini/recognition.go
Normal file
221
backend/internal/gemini/recognition.go
Normal file
@@ -0,0 +1,221 @@
|
||||
package gemini
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strings"
|
||||
"context"
|
||||
)
|
||||
|
||||
// RecognizedItem is a food item identified in an image.
//
// Values are decoded as-is from the model's JSON reply; nothing in this file
// validates them against the ranges the prompts ask for.
type RecognizedItem struct {
	// Name is the item name; the prompts in this file ask the model for Russian.
	Name string `json:"name"`

	// Quantity is the numeric amount, expressed in Unit.
	Quantity float64 `json:"quantity"`

	// Unit is the measurement unit; the prompts suggest г, кг, мл, л, шт
	// (plus уп for receipts).
	Unit string `json:"unit"`

	// Category is requested as one of: dairy, meat, produce, bakery, frozen,
	// beverages, other.
	Category string `json:"category"`

	// Confidence is the model's self-reported certainty, requested as 0.0–1.0.
	Confidence float64 `json:"confidence"`
}
|
||||
|
||||
// UnrecognizedItem is text from a receipt that could not be identified as food.
type UnrecognizedItem struct {
	// RawText is the receipt line as the model read it.
	RawText string `json:"raw_text"`

	// Price is the price attached to the line; it is left out of the encoded
	// JSON when zero (omitempty).
	Price float64 `json:"price,omitempty"`
}
|
||||
|
||||
// ReceiptResult is the full result of receipt OCR.
|
||||
type ReceiptResult struct {
|
||||
Items []RecognizedItem `json:"items"`
|
||||
Unrecognized []UnrecognizedItem `json:"unrecognized"`
|
||||
}
|
||||
|
||||
// DishResult is the result of dish recognition.
//
// Values are decoded as-is from the model's JSON reply; the dish prompt asks
// for all nutrition figures per portion.
type DishResult struct {
	// DishName is the dish name; the prompt asks the model for Russian.
	DishName string `json:"dish_name"`

	// WeightGrams is the estimated portion weight in grams.
	// NOTE(review): being an int, a fractional value in the reply (e.g. 350.5)
	// makes decoding fail — confirm the model reliably returns whole numbers.
	WeightGrams int `json:"weight_grams"`

	// Calories is the estimated energy of the portion.
	Calories float64 `json:"calories"`

	// ProteinG is the estimated protein of the portion, in grams.
	ProteinG float64 `json:"protein_g"`

	// FatG is the estimated fat of the portion, in grams.
	FatG float64 `json:"fat_g"`

	// CarbsG is the estimated carbohydrates of the portion, in grams.
	CarbsG float64 `json:"carbs_g"`

	// Confidence is the model's self-reported certainty, requested as 0.0–1.0.
	Confidence float64 `json:"confidence"`

	// SimilarDishes lists related dish names; the prompt asks for up to three.
	SimilarDishes []string `json:"similar_dishes"`
}
|
||||
|
||||
// IngredientClassification is the AI-produced classification of an unknown food item.
//
// Values are decoded as-is from the model's JSON reply and are not validated
// in this file.
type IngredientClassification struct {
	// CanonicalName is the canonical identifier; the prompt's example uses an
	// English snake_case form ("turkey_breast").
	CanonicalName string `json:"canonical_name"`

	// CanonicalNameRu is the canonical name in Russian.
	CanonicalNameRu string `json:"canonical_name_ru"`

	// Category is the food category (the prompt's example uses "meat").
	Category string `json:"category"`

	// DefaultUnit is the unit to use by default (the prompt's example uses "g").
	DefaultUnit string `json:"default_unit"`

	// The four nutrition fields are pointers so that a missing or null value in
	// the reply decodes to nil instead of being mistaken for a real zero.

	// CaloriesPer100g is the energy per 100 g, or nil when not provided.
	CaloriesPer100g *float64 `json:"calories_per_100g"`

	// ProteinPer100g is the protein per 100 g, or nil when not provided.
	ProteinPer100g *float64 `json:"protein_per_100g"`

	// FatPer100g is the fat per 100 g, or nil when not provided.
	FatPer100g *float64 `json:"fat_per_100g"`

	// CarbsPer100g is the carbohydrates per 100 g, or nil when not provided.
	CarbsPer100g *float64 `json:"carbs_per_100g"`

	// StorageDays is the suggested storage time in days.
	// NOTE(review): being an int, a fractional value in the reply makes
	// decoding fail — confirm the model reliably returns whole numbers.
	StorageDays int `json:"storage_days"`

	// Aliases are alternative names for the ingredient.
	Aliases []string `json:"aliases"`
}
|
||||
|
||||
// RecognizeReceipt uses the vision model to extract food items from a receipt photo.
|
||||
func (c *Client) RecognizeReceipt(ctx context.Context, imageBase64, mimeType string) (*ReceiptResult, error) {
|
||||
prompt := `Ты — OCR-система для чеков из продуктовых магазинов.
|
||||
|
||||
Проанализируй фото чека и извлеки список продуктов питания.
|
||||
Для каждого продукта определи:
|
||||
- name: название на русском языке (убери артикулы, коды, лишние символы)
|
||||
- quantity: количество (число)
|
||||
- unit: единица (г, кг, мл, л, шт, уп)
|
||||
- category: dairy | meat | produce | bakery | frozen | beverages | other
|
||||
- confidence: 0.0–1.0
|
||||
|
||||
Позиции, которые не являются едой (бытовая химия, табак, алкоголь) — пропусти.
|
||||
Позиции с нечитаемым текстом — добавь в unrecognized.
|
||||
|
||||
Верни ТОЛЬКО валидный JSON без markdown:
|
||||
{
|
||||
"items": [
|
||||
{"name": "Молоко 2.5%", "quantity": 1, "unit": "л", "category": "dairy", "confidence": 0.95}
|
||||
],
|
||||
"unrecognized": [
|
||||
{"raw_text": "ТОВ АРТИК 1ШТ", "price": 89.0}
|
||||
]
|
||||
}`
|
||||
|
||||
text, err := c.generateVisionContent(ctx, prompt, imageBase64, mimeType)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("recognize receipt: %w", err)
|
||||
}
|
||||
|
||||
var result ReceiptResult
|
||||
if err := parseJSON(text, &result); err != nil {
|
||||
return nil, fmt.Errorf("parse receipt result: %w", err)
|
||||
}
|
||||
if result.Items == nil {
|
||||
result.Items = []RecognizedItem{}
|
||||
}
|
||||
if result.Unrecognized == nil {
|
||||
result.Unrecognized = []UnrecognizedItem{}
|
||||
}
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
// RecognizeProducts uses the vision model to identify food items in a photo (fridge, shelf, etc.).
|
||||
func (c *Client) RecognizeProducts(ctx context.Context, imageBase64, mimeType string) ([]RecognizedItem, error) {
|
||||
prompt := `Ты — система распознавания продуктов питания.
|
||||
|
||||
Посмотри на фото и определи все видимые продукты питания.
|
||||
Для каждого продукта оцени:
|
||||
- name: название на русском языке
|
||||
- quantity: приблизительное количество (число)
|
||||
- unit: единица (г, кг, мл, л, шт)
|
||||
- category: dairy | meat | produce | bakery | frozen | beverages | other
|
||||
- confidence: 0.0–1.0
|
||||
|
||||
Только продукты питания. Пустые упаковки и несъедобные предметы — пропусти.
|
||||
|
||||
Верни ТОЛЬКО валидный JSON без markdown:
|
||||
{
|
||||
"items": [
|
||||
{"name": "Яйца", "quantity": 10, "unit": "шт", "category": "dairy", "confidence": 0.9}
|
||||
]
|
||||
}`
|
||||
|
||||
text, err := c.generateVisionContent(ctx, prompt, imageBase64, mimeType)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("recognize products: %w", err)
|
||||
}
|
||||
|
||||
var result struct {
|
||||
Items []RecognizedItem `json:"items"`
|
||||
}
|
||||
if err := parseJSON(text, &result); err != nil {
|
||||
return nil, fmt.Errorf("parse products result: %w", err)
|
||||
}
|
||||
if result.Items == nil {
|
||||
return []RecognizedItem{}, nil
|
||||
}
|
||||
return result.Items, nil
|
||||
}
|
||||
|
||||
// RecognizeDish uses the vision model to identify a dish and estimate its nutritional content.
|
||||
func (c *Client) RecognizeDish(ctx context.Context, imageBase64, mimeType string) (*DishResult, error) {
|
||||
prompt := `Ты — диетолог и кулинарный эксперт.
|
||||
|
||||
Посмотри на фото блюда и определи:
|
||||
- dish_name: название блюда на русском языке
|
||||
- weight_grams: приблизительный вес порции в граммах
|
||||
- calories: калорийность порции (приблизительно)
|
||||
- protein_g, fat_g, carbs_g: БЖУ на порцию
|
||||
- confidence: 0.0–1.0
|
||||
- similar_dishes: до 3 похожих блюд (для поиска рецептов)
|
||||
|
||||
Верни ТОЛЬКО валидный JSON без markdown:
|
||||
{
|
||||
"dish_name": "Паста Карбонара",
|
||||
"weight_grams": 350,
|
||||
"calories": 520,
|
||||
"protein_g": 22,
|
||||
"fat_g": 26,
|
||||
"carbs_g": 48,
|
||||
"confidence": 0.85,
|
||||
"similar_dishes": ["Паста с беконом", "Спагетти"]
|
||||
}`
|
||||
|
||||
text, err := c.generateVisionContent(ctx, prompt, imageBase64, mimeType)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("recognize dish: %w", err)
|
||||
}
|
||||
|
||||
var result DishResult
|
||||
if err := parseJSON(text, &result); err != nil {
|
||||
return nil, fmt.Errorf("parse dish result: %w", err)
|
||||
}
|
||||
if result.SimilarDishes == nil {
|
||||
result.SimilarDishes = []string{}
|
||||
}
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
// ClassifyIngredient uses the text model to classify an unknown food item
|
||||
// and build an ingredient_mappings record for it.
|
||||
func (c *Client) ClassifyIngredient(ctx context.Context, name string) (*IngredientClassification, error) {
|
||||
prompt := fmt.Sprintf(`Классифицируй продукт питания: "%s".
|
||||
|
||||
Ответь ТОЛЬКО валидным JSON без markdown:
|
||||
{
|
||||
"canonical_name": "turkey_breast",
|
||||
"canonical_name_ru": "грудка индейки",
|
||||
"category": "meat",
|
||||
"default_unit": "g",
|
||||
"calories_per_100g": 135,
|
||||
"protein_per_100g": 29,
|
||||
"fat_per_100g": 1,
|
||||
"carbs_per_100g": 0,
|
||||
"storage_days": 3,
|
||||
"aliases": ["грудка индейки", "филе индейки", "turkey breast"]
|
||||
}`, name)
|
||||
|
||||
messages := []map[string]string{
|
||||
{"role": "user", "content": prompt},
|
||||
}
|
||||
text, err := c.generateContent(ctx, messages)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("classify ingredient: %w", err)
|
||||
}
|
||||
|
||||
var result IngredientClassification
|
||||
if err := parseJSON(text, &result); err != nil {
|
||||
return nil, fmt.Errorf("parse classification: %w", err)
|
||||
}
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
// parseJSON strips optional markdown fences and unmarshals JSON.
//
// text is a raw model reply and dst is the pointer handed to json.Unmarshal.
// The reply is first trimmed and, when it starts with a code fence, the
// leading "```json" / "```" and trailing "```" are removed. If that candidate
// does not decode, a second attempt is made on the outermost {...} or [...]
// span found in it. This recovers replies where the model used a different
// fence tag (e.g. "```JSON") or wrapped the JSON in explanatory prose.
//
// Any input that decodes on the first attempt is handled exactly as before.
// When both attempts fail, the error from the first attempt is returned.
func parseJSON(text string, dst any) error {
	text = strings.TrimSpace(text)
	if strings.HasPrefix(text, "```") {
		text = strings.TrimPrefix(text, "```json")
		text = strings.TrimPrefix(text, "```")
		text = strings.TrimSuffix(text, "```")
		text = strings.TrimSpace(text)
	}

	err := json.Unmarshal([]byte(text), dst)
	if err == nil {
		return nil
	}

	// Fallback: locate the first opening bracket and the last matching
	// closing bracket and try to decode just that span.
	if start := strings.IndexAny(text, "{["); start >= 0 {
		closer := "}"
		if text[start] == '[' {
			closer = "]"
		}
		if end := strings.LastIndex(text, closer); end > start {
			if json.Unmarshal([]byte(text[start:end+1]), dst) == nil {
				return nil
			}
		}
	}
	return err
}
|
||||
Reference in New Issue
Block a user