Source code for pyfood.utils

import re
import json
import unidecode
import urllib.parse
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, load_npz
from typing import List, Tuple, Dict, Optional


import os

# Directory containing this module; every bundled asset is resolved relative to it.
prefix = os.path.dirname(os.path.abspath(__file__))


# JSON is UTF-8 by specification, so the encoding is pinned explicitly instead
# of relying on the platform's locale default.

# Unit conversion tables, read by Shelf.convert2g under the keys
# "masse", "volume" and "misc".
with open(
    os.path.join(prefix, "assets", "nutri", "units.json"), "r", encoding="utf-8"
) as fp:
    units = json.load(fp)

# Per nutrition category, maps a nutrient name to the id used to index the
# nutriscores vector in Shelf.process_ingredients.
with open(
    os.path.join(prefix, "assets", "nutri", "nutrimap.json"), "r", encoding="utf-8"
) as fp:
    nutrimap = json.load(fp)

# Per nutrition category and nutrient name, the reference value used as the
# denominator of the percentage scores in Shelf.process_ingredients.
with open(
    os.path.join(prefix, "assets", "nutri", "vnr.json"), "r", encoding="utf-8"
) as fp:
    vnr = json.load(fp)


def lists2object(
    foodnames: List[str], quantities: List[str], unit_list: List[str], taxons: List[str]
) -> List[Dict]:
    """Zip four parallel lists into a list of ingredient records.

    Each record is a dict with the keys ``foodname``, ``quantity``, ``unit``
    and ``taxon``. The result is as long as the shortest input list.
    """
    keys = ("foodname", "quantity", "unit", "taxon")
    rows = zip(foodnames, quantities, unit_list, taxons)
    return [dict(zip(keys, row)) for row in rows]


def str2float(string: str) -> float:
    """Converts a string to a float.

    Accepts plain numbers (``"2.5"``) and simple fractions (``"1/2"``); for a
    fraction only the first two ``/``-separated parts are used. Values that are
    already numeric are passed through ``float``. Anything that cannot be
    interpreted (empty string, text, division by zero, ``None``) falls back to
    ``1.0`` so that a malformed quantity counts as a single unit.
    """
    try:
        if isinstance(string, str) and "/" in string:
            parts = string.split("/")
            return float(parts[0]) / float(parts[1])
        return float(string)
    except (TypeError, ValueError, ZeroDivisionError, OverflowError):
        # Best-effort parsing: only conversion failures are absorbed, so
        # KeyboardInterrupt / SystemExit still propagate.
        return 1.0


def str2ngrams(string: str) -> List[str]:
    """Split a string on whitespace and return its word n-grams.

    The result lists all single tokens first, then all trigrams, then all
    bigrams, each group in order of appearance.
    """
    tokens = string.split()
    count = len(tokens)
    # range() is empty when there are too few tokens, so no length guard is needed.
    trigrams = [" ".join(tokens[i : i + 3]) for i in range(count - 2)]
    bigrams = [" ".join(tokens[i : i + 2]) for i in range(count - 1)]
    return tokens + trigrams + bigrams


class Shelf(object):
    """Shelf object embedded in a given region, month_id and optionally source language"""

    def __init__(self, region: str = "EU", lang_source: str = "un", month_id: int = 0):
        """Instantiates a shelf from a context (region, lang_source, month_id).

        Parameters
        ----------
        region : str
            Region code; selects the asset ``sparse_seasons_{region}.npz``
        lang_source : str
            Language code (lower-cased here); selects the vocabulary in ``mapping.json``
        month_id : int
            Row of the seasonal matrix used as the current seasonal vector
        """
        self.region, self.lang_source, self.month_id = (
            region,
            lang_source.lower(),
            month_id,
        )

        # JSON assets are decoded as UTF-8 explicitly rather than with the
        # platform's locale default.
        with open(
            os.path.join(prefix, "assets", "vocab", "mapping.json"),
            "r",
            encoding="utf-8",
        ) as fp:
            mapping = json.load(fp)  # food universal mapping (f2id, id2f)
        # load session's language for basket of food (BOW) representation
        self.mapping = mapping[self.lang_source]

        with open(
            os.path.join(prefix, "assets", "vocab", "feats.json"),
            "r",
            encoding="utf-8",
        ) as fp:
            # food attributes {food_id: {'taxon': , 'fr': , 'es': , 'en': ,
            # 'default_weight': , 'default_density': , 'benefits/diet': , nutri...}}
            self.feats = json.load(fp)
        self.n_ingredients = len(self.feats)  # total number of ingredients

        # domain expert knowledge (n_seasons, n_food) varies by 'regions'
        self.seasonal_matrix = load_npz(
            os.path.join(
                prefix, "assets", "seasons", "sparse_seasons_{}.npz".format(region)
            )
        )
        # local and seasonal ingredients for the selected month
        self.seasonal_vector = np.ravel(self.seasonal_matrix[self.month_id].toarray())

        self.tfidf = None  # language model (ngrams2vec), loaded lazily by load_tfidf
        self.model = None  # RBM model (revise)

    def get_seasonal_food(self, key: str = "001") -> List[str]:
        """
        Returns what food is in season

        Parameters
        ----------
        key : str
            taxon code (fruits: 001, vegetables: 002, ..., mushrooms: 005)

        Returns
        ----------
        list
            seasonal_food: Capitalized, alphabetically sorted names of the food in
            season in self.region, self.month_id and self.lang_source
        """
        seasonal_food = []
        # flatnonzero yields scalar indices; argwhere would yield 1-element
        # arrays, and calling float() on those is deprecated in NumPy.
        for fid in np.flatnonzero(self.seasonal_vector > 0.0):
            feats = self.feats[str(float(fid))]  # feats is keyed by float-formatted ids
            foodname = feats[self.lang_source]
            taxon = feats["taxon"]
            if taxon.startswith(key):
                seasonal_food.append(foodname.capitalize())
        return sorted(seasonal_food)

    def get_urls(self, food_list: List[str]) -> List[str]:
        """Returns the ``url`` attribute of each food name (matched lower-cased).

        Raises ``KeyError`` when a name is not in the vocabulary.
        """
        return [
            self.feats[str(self.mapping["f2id"][foodname.lower()])]["url"]
            for foodname in food_list
        ]

    def get_nutri(self, food_list: List[str], category: str = "minerals") -> List:
        """Returns one empty list per food name.

        The per-food nutrient tag lookup is currently disabled, so ``category``
        is accepted for interface compatibility but not used.
        """
        return [[] for _ in food_list]

    def load_tfidf(self):
        """Loads self.lang_source language model.

        The vectorizer (``self.tfidf``) and the vocabulary n-gram matrix
        (``self.ngrams``) are unpickled once, on first use.
        """
        if self.tfidf is None:
            vocab_dir = os.path.join(prefix, "assets", "vocab")
            # NOTE: pickle executes code on load; these files are read from the
            # package's own assets directory and must never be user supplied.
            with open(
                os.path.join(
                    vocab_dir,
                    "{}/tfidf_{}.pickle".format(self.lang_source, self.lang_source),
                ),
                "rb",
            ) as fp:
                self.tfidf = pickle.load(fp)  # ngram vectorizer for food names
            with open(
                os.path.join(
                    vocab_dir,
                    "{}/ngrams_{}.pickle".format(self.lang_source, self.lang_source),
                ),
                "rb",
            ) as fp:
                self.ngrams = pickle.load(fp)

    def get_food_info(self, food_name: str) -> Tuple:
        """
        Returns food id, taxon and seasonality.

        Parameters
        ----------
        food_name : str
            Food name in self.lang_source

        Returns
        ----------
        food_name : str
            Food name in self.lang_source
        fid : float
            Food id in self.lang_source
        taxon : str
            Food taxon (fruits: 001, vegetables: 002, ..., mushrooms: 005)
        score : float
            Food score in self.region and self.month_id

        Raises
        ------
        KeyError
            If food_name is not in the vocabulary
        """
        fid = self.mapping["f2id"][food_name]  # ref id, for example: Reine Claude --> Prune
        taxon = self.feats[str(fid)]["taxon"]
        score = self.seasonal_vector[int(fid)]
        return food_name, fid, taxon, score

    def text2food(self, food_name: str = "apple", threshold: float = 0.0) -> Tuple:
        """
        Retrieves ingredient from vocabulary using `tdfidf` and `cosine_similarity`.
        Returns food id, taxon and seasonality.

        Exact vocabulary matches are tried first (raw name, then cleaned name,
        then unaccented alphabetic name); only if all fail is the n-gram
        similarity search used.

        Parameters
        ----------
        food_name : str
            (Noisy) food name in self.lang_source
        threshold : float
            Minimum similarity (exclusive) for a fuzzy match to be accepted

        Returns
        ----------
        food_name : str
            Food name in self.lang_source (the cleaned input when nothing matched)
        fid : float
            Food id in self.lang_source, None when nothing matched
        taxon : str
            Food taxon (fruits: 001, vegetables: 002, ..., mushrooms: 005), None when nothing matched
        score : float
            Food score in self.region and self.month_id, None when nothing matched
        """
        if food_name in self.mapping["f2id"]:
            return self.get_food_info(food_name=food_name)

        # replace %xx escapes by their single-character equivalent (utf-8 encoding)
        food_name = urllib.parse.unquote(food_name)
        # remove parenthesis, lower case and strip string
        food_name = re.sub(r"\(.*\)", "", food_name).lower().strip()
        if food_name in self.mapping["f2id"]:
            return self.get_food_info(food_name=food_name)
        if not food_name:
            # Nothing left to match: without this guard an empty candidate list
            # would reach the vectorizer / cosine_similarity and fail.
            return food_name, None, None, None

        # convert unicode with accents to unaccented_string
        food_name_az = unidecode.unidecode(food_name)
        # replace apostrophes, non alpha characters by spaces and strip string
        food_name_az = re.sub("[^a-zA-Z]+", " ", food_name_az).strip()
        if food_name_az in self.mapping["f2id"]:
            return self.get_food_info(food_name=food_name_az)

        self.load_tfidf()
        candidates = str2ngrams(food_name) + str2ngrams(food_name_az)  # list of ngrams
        # remove duplicate ngrams; dict.fromkeys keeps first-seen order so that
        # ties are broken the same way on every run
        candidates = list(dict.fromkeys(candidates))
        # shorter ngrams cover less of the query and are scaled down accordingly
        penalty = np.array([[len(c) / len(food_name)] for c in candidates])
        affinity = penalty * cosine_similarity(
            self.tfidf.transform(candidates), self.ngrams, dense_output=True
        )  # (len(candidates), n_ingredients)
        neighbors_id = np.argmax(affinity, axis=1)  # best vocabulary entry per candidate
        values = affinity[np.arange(len(neighbors_id)), neighbors_id]
        row = np.argmax(values)  # best candidate overall
        nearest_neighbor_id = neighbors_id[row]
        nearest_neighbor_similarity = values[row]

        if nearest_neighbor_similarity > threshold:
            food_name = self.mapping["id2f"][str(float(nearest_neighbor_id))]
            return self.get_food_info(food_name=food_name)
        return food_name, None, None, None

    def convert2g(self, fid: float, qty: float, unit: str) -> float:
        """Converts a food id, quantity and unit in grams.

        Parameters
        ----------
        fid : float
            Food id
        qty : float
            Quantity
        unit : str
            Unit (case-insensitive); unknown units are treated like "default"

        Returns
        ----------
        weight: float
            Quantity in grams
        """
        unit = unit.lower()
        feats = self.feats[str(fid)]
        if unit == "default":
            return qty * feats["default_weight"]  # convert to g
        if unit in units["masse"]:
            return qty * float(units["masse"][unit])
        if unit in units["volume"]:
            # NOTE(review): reads the "density" attribute, while the feats comment
            # in __init__ mentions 'default_density' -- confirm against feats.json.
            return qty * float(units["volume"][unit]) * feats["density"]
        if unit in units["misc"]:
            return qty * float(units["misc"][unit]) * feats["default_weight"]
        return qty * feats["default_weight"]

    def NER(
        self,
        food_list: List[str],
        qty_list: Optional[List[str]] = None,
        unit_list: Optional[List[str]] = None,
        lang_dest: Optional[str] = None,
        filter_HS: bool = True,
    ) -> Tuple:
        """Named Entity Recognition on a food (+ optionally quantity and unit) list.

        Parameters
        ----------
        food_list : list
            (Noisy) list of food name in self.lang_source
        qty_list : list
            List of quantities (default "100" for every food)
        unit_list : list
            List of units (default "g" for every food)
        lang_dest : str
            Target language (default self.lang_source)
        filter_HS : bool
            When True, food with a seasonal score below 0.5 or a taxon starting
            with "21" is reported in HS / HS_id instead of the main lists

        Returns
        ----------
        recipe_vector : np.array(self.n_ingredients,1)
            Vectorized basket of food (grams per recognized ingredient, HS included)
        foodnames : list
            Denoised food list in lang_dest
        qties : list
            List of quantities (float, parsed with str2float)
        units : list
            List of units (str)
        taxons : list
            List of taxons
        HS : list
            List of food out of season
        HS_id : list
            Food ids of the entries in HS
        """
        lang_dest = self.lang_source if lang_dest is None else lang_dest.lower()
        if qty_list is None:
            qty_list = ["100"] * len(food_list)
        if unit_list is None:
            unit_list = ["g"] * len(food_list)

        recipe_vector = np.zeros((self.n_ingredients, 1))  # basket of food (n_ingredients,1)
        # extracted entities (length <= len(food_list)); the unit list is not
        # called `units` so it does not shadow the module-level conversion table
        foodnames, qties, kept_units, taxons = [], [], [], []
        HS, HS_id = [], []  # hors saison (out of season)

        for food_, qty_, unit_ in zip(food_list, qty_list, unit_list):
            food_, fid, taxon, score = self.text2food(food_name=food_)
            if fid is None:  # food_ is not in food vocab: skipped entirely
                continue
            if lang_dest != self.lang_source:
                food_ = self.feats[str(fid)][lang_dest]  # translate food name
            qty_ = str2float(qty_)
            weight = self.convert2g(fid, qty_, unit_)  # convert to g
            recipe_vector[int(fid)] += weight  # quantity in g

            # seasonality threshold below 0.5 or non vege ingredient
            # (i.e. fish, meat, egg, seafood)
            if filter_HS and (score < 0.5 or taxon.startswith("21")):
                HS.append(food_)
                HS_id.append(fid)
            else:
                qties.append(qty_)
                kept_units.append(unit_)
                taxons.append(taxon)
                foodnames.append(food_)

        return recipe_vector, foodnames, qties, kept_units, taxons, HS, HS_id

    def process_ingredients(
        self,
        food_list: List[str],
        qty_list: Optional[List[str]] = None,
        unit_list: Optional[List[str]] = None,
        lang_dest: Optional[str] = None,
        revisit: bool = False,
        infer_nutri: bool = False,
        serving: int = 1,
    ):
        """Labels a list of ingredients, e.g., from a recipe or a basket of food,
        and saves attributes / labels in self.tags

        Parameters
        ----------
        food_list : list
            List of food name in self.lang_source
        qty_list : list
            List of quantities or None
        unit_list : list
            List of units or None
        lang_dest : str
            Target language, default self.lang_source
        revisit : bool
            Infer Recipe2BetterRecipe, default False (currently not used:
            ``tags["revisited"]`` is always empty)
        infer_nutri : bool
            Infer nutrition scores, default False
        serving : int
            Number of portions, default 1.

        Returns
        -------
        tags: dict
            Extracted ingredients (ingredients, ingredients_by_taxon, HS, revisited),
            predicted nutrition (only when infer_nutri) and labels (vege, vegan,
            seasonality)
        """
        # read recipe: map ingredients to basket of food
        my_recipe_vector, foodnames, qties, unit_list, taxons, HS, HS_id = self.NER(
            food_list=food_list,
            qty_list=qty_list,
            unit_list=unit_list,
            lang_dest=lang_dest,
        )

        self.tags = {}  # init tags
        # seasonal and vege ingredients
        self.tags["ingredients"] = lists2object(foodnames, qties, unit_list, taxons)
        self.tags["HS"] = HS  # rest of ingredients

        # group names by taxon, e.g. {001: [pomme, poire], 101: [sel]} ...
        by_taxon = {}
        for foodname, taxon in zip(foodnames, taxons):
            by_taxon.setdefault(taxon, []).append(foodname)
        # ... then flatten to a list of lists ordered by taxon code,
        # e.g. [[pomme, poire], [sel]]
        self.tags["ingredients_by_taxon"] = [by_taxon[taxon] for taxon in sorted(by_taxon)]

        if my_recipe_vector.sum() == 0:
            # Nothing was recognized (or all weights are zero): neutral result.
            # NOTE(review): this branch uses the keys "region", "Allergies" and
            # "Vitamines" whereas the regular path below uses "city" and
            # "Vitamins"; kept as is because callers may rely on either shape.
            self.tags["labels"] = {
                "vegan": False,
                "vege": False,
                "seasonality": {"region": self.region, "score": 0, "best_now": False},
            }
            self.tags["revisited"] = []
            if infer_nutri:
                self.tags["nutri"] = {
                    "Allergies": {},
                    "Energy": {},
                    "Macro": {},
                    "Minerals": {},
                    "Vitamines": {},
                }
            return self.tags

        # infer seasonality
        # normalize ingredients (proportions) for seasonality
        my_recipe_vector /= my_recipe_vector.sum()
        # (n_seasons, n_ingredients) @ (n_ingredients, 1) = (n_seasons, 1)
        my_seasons_vector = (
            self.seasonal_matrix @ csr_matrix(my_recipe_vector)
        ).toarray()
        seasonality = {
            "city": self.region,
            "score": int(100 * my_seasons_vector[self.month_id][0]),
            "best_now": len(HS) == 0,
        }

        # infer labels
        # Vegetarian label: no filtered-out ingredient has a taxon starting with "21"
        is_vege = not any(
            self.feats[str(fid)]["taxon"].startswith("21") for fid in HS_id
        )
        # Vegan label (check no dairy and cheese)
        is_vegan = is_vege and not any(taxon.startswith("2") for taxon in taxons)
        self.tags["labels"] = {
            "vegan": is_vegan,
            "vege": is_vege,
            "seasonality": seasonality,
        }

        # revisit recipe: substitution of out-of-season ingredients is disabled,
        # so the list is always empty regardless of `revisit`
        self.tags["revisited"] = []

        # infer nutriscores and VNR
        if infer_nutri:
            nutrition = {"Energy": {}, "Macro": {}, "Minerals": {}, "Vitamins": {}}
            nutriscores = load_npz(
                os.path.join(prefix, "assets", "nutri", "sparse_nutriscores.npz")
            )  # load nutriscores matrix
            my_nutri_vector = (nutriscores @ csr_matrix(my_recipe_vector)).toarray()
            my_nutri_vector = my_nutri_vector / float(serving)
            for c in ["Energy", "Macro", "Minerals", "Vitamins"]:
                for nutri_name, nutri_id in nutrimap[c].items():
                    # score as a percentage of the reference value (% VNR);
                    # [0] extracts the scalar from the (n, 1) column vector
                    composcore = (
                        100 * float(my_nutri_vector[nutri_id][0]) / vnr[c][nutri_name]
                    )
                    if composcore > 0:
                        nutrition[c][nutri_name] = composcore
            self.tags["nutri"] = nutrition

        return self.tags