Memory management enables LLM applications to maintain context across conversations, remember user preferences, and provide personalized experiences. This guide covers implementation patterns using Mem0 with Qdrant vector store.
import streamlit as st
from mem0 import Memory
from litellm import completion

# NOTE(review): `m` is used below but is never created in this snippet. It is
# presumably a configured `mem0.Memory` instance (e.g. one built for the Qdrant
# vector store this guide describes) -- confirm and create it before this point.

# User-specific session management
if "messages" not in st.session_state:
    st.session_state.messages = []
if "previous_user_id" not in st.session_state:
    st.session_state.previous_user_id = None

user_id = st.text_input("Enter your Username")

# Clear history on user switch so one user's chat never leaks into another's
if user_id != st.session_state.previous_user_id:
    st.session_state.messages = []
    st.session_state.previous_user_id = user_id

if prompt := st.chat_input("What is your message?"):
    # Add to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Store in memory
    m.add(prompt, user_id=user_id)

    # Retrieve context from memory: one "- <memory>" bullet line per stored entry
    memories = m.get_all(user_id=user_id)
    stored = memories["results"] if memories and "results" in memories else []
    context = "".join(
        f"- {entry['memory']}\n" for entry in stored if "memory" in entry
    )

    # Generate response with context
    response = completion(
        model="ollama/llama3.1:latest",
        messages=[
            {
                "role": "system",
                "content": "You are a helpful assistant with access to past conversations.",
            },
            {
                "role": "user",
                "content": f"Context: {context}\nCurrent message: {prompt}",
            },
        ],
        api_base="http://localhost:11434",
        stream=True,
    )

    # With stream=True the reply arrives as incremental chunks; collect the text
    # deltas into the full reply. Chunks without choices or content are skipped.
    pieces = []
    for chunk in response:
        choices = getattr(chunk, "choices", None)
        if not choices:
            continue
        delta = getattr(choices[0], "delta", None)
        piece = getattr(delta, "content", None)
        if piece:
            pieces.append(piece)
    full_response = "".join(pieces)

    if full_response:
        # Keep the assistant turn in the chat history alongside the user turn
        st.session_state.messages.append(
            {"role": "assistant", "content": full_response}
        )
        # Store response in memory
        m.add(f"Assistant: {full_response}", user_id=user_id)
# Add user message to memory
from datetime import datetime, timezone

memory.add(
    "I'm interested in machine learning papers on transformers",
    user_id="user123",
    metadata={
        # Store the timestamp as a timezone-aware ISO-8601 string: a plain
        # string can be serialized as JSON, whereas a datetime object cannot.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "category": "preference",
    },
)
# Semantic search through memories
relevant_memories = memory.search(
    "papers about attention mechanisms",
    user_id="user123",
    limit=3,
)

# The other snippets in this guide read hits from a "results" list whose items
# carry a "memory" field, so unwrap that shape here as well. A bare list with
# "text" fields is still accepted in case the client returns the older layout.
if isinstance(relevant_memories, dict):
    hits = relevant_memories.get("results", [])
else:
    hits = relevant_memories or []

for mem in hits:
    text = mem.get("memory", mem.get("text", ""))
    print(f"- {text} (relevance: {mem.get('score')})")
# Get complete memory history
all_memories = memory.get_all(user_id="user123")

if "results" in all_memories:
    # The loop variable is deliberately not named `memory`: that would rebind
    # the memory client and break every later `memory.*` call.
    for entry in all_memories["results"]:
        print(f"- {entry['memory']}")
❌ Too vague to be useful: “User had a long conversation about many topics”
Smaller, focused memories enable better retrieval and context building.
Context Window Management
Optimize context usage:
# Retrieve only relevant memories
memories = memory.search(current_query, user_id=user_id, limit=3)

# Unwrap the {"results": [...]} shape used by the other snippets in this guide;
# a bare list is still accepted in case the client returns the older layout.
if isinstance(memories, dict):
    hits = memories.get("results", [])
else:
    hits = memories or []

# Build concise context: one memory per line, at most three
context = "\n".join(
    hit.get("memory", hit.get("text", "")) for hit in hits[:3]
)

# Don't exceed the context budget. len() counts characters, not tokens, so
# this is only a rough cap on prompt size.
MAX_CONTEXT_CHARS = 1000
if len(context) > MAX_CONTEXT_CHARS:
    context = context[:MAX_CONTEXT_CHARS]
Privacy & Data Management
User data controls:
import json

user_id = "user123"

# Export user data. This must run BEFORE the delete below; exporting after
# deletion would write out an empty result.
user_data = memory.get_all(user_id=user_id)
with open(f"{user_id}_memories.json", "w", encoding="utf-8") as f:
    json.dump(user_data, f)

# Delete user memories
memory.delete_all(user_id=user_id)
Performance Optimization
Optimize vector operations:
Use appropriate embedding dimensions (768 for nomic-embed-text)
def _memory_text(hit):
    """Return the text of one memory search hit as a string."""
    if isinstance(hit, dict):
        return str(hit.get("memory", hit.get("text", "")))
    return str(hit)


def rag_with_memory(query, documents, user_id):
    """Answer *query* using retrieved documents plus the user's stored memories.

    Args:
        query: The user's question; used for both the memory search and the
            document similarity search.
        documents: Currently unused. NOTE(review): retrieval goes through the
            module-level `vector_store`; presumably the documents are indexed
            there beforehand -- confirm, or wire this parameter in.
        user_id: Identifier whose memories are searched.

    Returns:
        Whatever `llm.generate` returns for the assembled prompt.
    """
    # Retrieve user preferences and past interactions
    found = memory.search(query, user_id=user_id, limit=3)
    # Unwrap the {"results": [...]} shape used elsewhere in this guide; a bare
    # list is still accepted in case the client returns the older layout.
    if isinstance(found, dict):
        hits = found.get("results", [])
    else:
        hits = found or []
    # Render memories as readable bullet lines instead of a raw Python repr
    user_context = "\n".join(f"- {_memory_text(hit)}" for hit in hits)

    # Standard RAG retrieval
    relevant_docs = vector_store.similarity_search(query, k=5)
    # Use each result's `page_content` when it exposes one (assumed to be
    # LangChain-style documents -- TODO confirm); otherwise use the item itself.
    doc_context = "\n\n".join(
        str(getattr(doc, "page_content", doc)) for doc in relevant_docs
    )

    # Combine with memory context
    enhanced_prompt = (
        f"User context:\n{user_context}\n\n"
        f"Relevant documents:\n{doc_context}\n\n"
        f"Query: {query}\n\n"
        "Provide a personalized response based on user preferences and documents."
    )
    return llm.generate(enhanced_prompt)