JRosenkranz committed (verified)
Commit 005b255 · 1 Parent(s): e1d9017

Update README.md
Files changed (1): README.md +54 -6
README.md CHANGED
@@ -40,17 +40,45 @@ _Note: For all samples, your environment must have access to cuda_
 #### Setup
 
 ```bash
-docker pull quay.io/wxpe/text-gen-server:main.ee927a4
+HF_HUB_CACHE=/hf_hub_cache
+HF_HUB_TOKEN="your huggingface hub token"
+TGIS_IMAGE=quay.io/wxpe/text-gen-server:main.ee927a4
+
+docker pull $TGIS_IMAGE
+
+# optionally download CodeLlama-13b-Instruct-hf if the weights do not already exist
+docker run --rm \
+    -v $HF_HUB_CACHE:/models \
+    -e HF_HUB_CACHE=/models \
+    -e TRANSFORMERS_CACHE=/models \
+    $TGIS_IMAGE \
+    text-generation-server download-weights \
+    codellama/CodeLlama-13b-Instruct-hf \
+    --token $HF_HUB_TOKEN
+
+# optionally download the speculator model if the weights do not already exist
+docker run --rm \
+    -v $HF_HUB_CACHE:/models \
+    -e HF_HUB_CACHE=/models \
+    -e TRANSFORMERS_CACHE=/models \
+    $TGIS_IMAGE \
+    text-generation-server download-weights \
+    ibm-fms/codellama-13b-accelerator \
+    --token $HF_HUB_TOKEN
+
+# note: if the weights were downloaded separately (not with the above commands), place them in the HF_HUB_CACHE directory and refer to them as /models/<model_name>
 docker run -d --rm --gpus all \
     --name my-tgis-server \
     -p 8033:8033 \
-    -v /path/to/all/models:/models \
-    -e MODEL_NAME=/models/model_weights/llama/CodeLlama-13b-Instruct-hf \
-    -e SPECULATOR_NAME=/models/speculator_weights/llama/codellama-13b-accelerator \
+    -v $HF_HUB_CACHE:/models \
+    -e HF_HUB_CACHE=/models \
+    -e TRANSFORMERS_CACHE=/models \
+    -e MODEL_NAME=codellama/CodeLlama-13b-Instruct-hf \
+    -e SPECULATOR_NAME=ibm-fms/codellama-13b-accelerator \
     -e FLASH_ATTENTION=true \
     -e PAGED_ATTENTION=true \
-    -e DTYPE_STR=float16 \
-    quay.io/wxpe/text-gen-server:main.ee927a4
+    -e DTYPE=float16 \
+    $TGIS_IMAGE
 
 # check logs and wait for "gRPC server started on port 8033" and "HTTP server started on port 3000"
 docker logs my-tgis-server -f
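
Once the logs show both servers up, the gRPC endpoint can be smoke-tested from the host. The sketch below is illustrative and not part of this commit: it assumes grpcurl is installed and that the generation.proto from the text-gen-server repo has been copied to ./proto; the service and field names (fmaas.GenerationService/Generate, requests, text) come from that proto, so double-check them against your checkout.

```bash
# hypothetical smoke test against the server started above
# (the speculator runs server-side, so the request shape is the same
#  as for a non-speculative TGIS deployment)
grpcurl -plaintext \
    -proto proto/generation.proto \
    -d '{"requests": [{"text": "def print_hello_world():"}]}' \
    localhost:8033 \
    fmaas.GenerationService/Generate
```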
@@ -74,6 +102,26 @@ _Note: first prompt may be slower as there is a slight warmup time_
 
 ### Minimal Sample
 
+*To try this out with the fms-native compiled model, please execute the following:*
+
+#### Install
+
+```bash
+git clone https://github.com/foundation-model-stack/fms-extras
+(cd fms-extras && pip install -e .)
+pip install transformers==4.35.0 sentencepiece numpy
+```
+
+#### Run Sample
+
+```bash
+python sample_client.py
+```
+
+_Note: first prompt may be slower as there is a slight warmup time_
+
+### Minimal Sample
+
 #### Install
 
 ```bash
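
As a usage note for the new Minimal Sample, the install and run commands from the diff above can be wrapped in a throwaway virtual environment so the pinned transformers==4.35.0 does not clash with an existing setup. Everything here is the commit's own commands plus standard venv boilerplate; the environment name spec-decode-venv is an arbitrary choice.

```bash
# optional: isolate the fms-extras sample in a fresh virtual environment
python -m venv spec-decode-venv
source spec-decode-venv/bin/activate

# install steps from the README
git clone https://github.com/foundation-model-stack/fms-extras
(cd fms-extras && pip install -e .)
pip install transformers==4.35.0 sentencepiece numpy

# run the sample; the first prompt may be slower due to warmup
python sample_client.py
```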