diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index a294ebf8a..df54e0326 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -96,5 +96,8 @@ def parse_bool_arg(arg):
     app = create_app(settings=settings)
 
     uvicorn.run(
-        app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port))
+        app,
+        host=os.getenv("HOST", settings.host),
+        port=int(os.getenv("PORT", settings.port)),
+        limit_concurrency=int(os.getenv("LIMIT_CONCURRENCY", settings.limit_concurrency))
     )
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index bec956147..031f0560a 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -157,6 +157,10 @@ class Settings(BaseSettings):
     )
     # Server Params
     host: str = Field(default="localhost", description="Listen address")
+    limit_concurrency: int = Field(
+        default=2,
+        description="Maximum number of concurrent connections or tasks before uvicorn responds with HTTP 503 (number of concurrent processes + 1)"
+    )
     port: int = Field(default=8000, description="Listen port")
     interrupt_requests: bool = Field(
         default=True,