File size: 19,647 Bytes
27ba5c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"id": "64f33f31-f533-41e8-9821-940a5d2ea343",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Inspecting file: model-00001-of-00004.safetensors\n",
"Available keys (tensor names):\n",
"\n",
"Key: model.layers.0.input_layernorm.weight\n",
" Shape: torch.Size([3584])\n",
" Dtype: torch.bfloat16\n",
" Size: 3584 elements\n",
" First few elements: [0.275390625, 0.3046875, 0.26171875, 0.291015625, 0.29296875]\n",
"\n",
"Key: model.layers.0.mlp.down_proj.weight\n",
" Shape: torch.Size([3584, 18944])\n",
" Dtype: torch.bfloat16\n",
" Size: 67895296 elements\n",
" First few elements: [-0.005096435546875, 0.01385498046875, 0.0096435546875, -0.00848388671875, -0.002593994140625]\n",
"\n",
"Key: model.layers.0.mlp.gate_proj.weight\n",
" Shape: torch.Size([18944, 3584])\n",
" Dtype: torch.bfloat16\n",
" Size: 67895296 elements\n",
" First few elements: [0.00286865234375, -0.0201416015625, -0.0216064453125, 0.006622314453125, -0.015625]\n",
"\n",
"Key: model.layers.0.mlp.up_proj.weight\n",
" Shape: torch.Size([18944, 3584])\n",
" Dtype: torch.bfloat16\n",
" Size: 67895296 elements\n",
" First few elements: [0.007537841796875, -0.0111083984375, -0.0024261474609375, -0.006927490234375, -0.02587890625]\n",
"\n",
"Key: model.layers.0.post_attention_layernorm.weight\n",
" Shape: torch.Size([3584])\n",
" Dtype: torch.bfloat16\n",
" Size: 3584 elements\n",
" First few elements: [0.28515625, 0.33203125, 0.259765625, 0.236328125, 0.296875]\n",
"\n",
"Key: model.layers.0.self_attn.kv_a_proj_with_mqa.bias\n",
" Shape: torch.Size([576])\n",
" Dtype: torch.bfloat16\n",
" Size: 576 elements\n",
" First few elements: [3.953125, 0.0634765625, 1.2578125, -1.515625, 0.29296875]\n",
"\n",
"Key: model.layers.0.self_attn.kv_a_proj_with_mqa.weight\n",
" Shape: torch.Size([576, 3584])\n",
" Dtype: torch.bfloat16\n",
" Size: 2064384 elements\n",
" First few elements: [0.0322265625, -0.005157470703125, -0.03173828125, 0.0184326171875, -0.015625]\n",
"\n",
"Key: model.layers.0.self_attn.kv_b_proj.weight\n",
" Shape: torch.Size([7168, 512])\n",
" Dtype: torch.bfloat16\n",
" Size: 3670016 elements\n",
" First few elements: [-0.04931640625, -0.01904296875, 0.080078125, -0.01324462890625, 0.0179443359375]\n",
"\n",
"Key: model.layers.0.self_attn.o_proj.weight\n",
" Shape: torch.Size([3584, 3584])\n",
" Dtype: torch.bfloat16\n",
" Size: 12845056 elements\n",
" First few elements: [0.00323486328125, -0.030029296875, -0.0069580078125, 0.0089111328125, 0.007568359375]\n",
"\n",
"Key: model.layers.0.self_attn.q_proj.bias\n",
" Shape: torch.Size([5376])\n",
" Dtype: torch.bfloat16\n",
" Size: 5376 elements\n",
" First few elements: [0.5, 2.140625, -0.98046875, 1.3671875, 1.015625]\n",
"\n",
"Key: model.layers.0.self_attn.q_proj.weight\n",
" Shape: torch.Size([5376, 3584])\n",
" Dtype: torch.bfloat16\n",
" Size: 19267584 elements\n",
" First few elements: [-0.0003452301025390625, -0.005340576171875, 0.021484375, 0.003997802734375, -0.00274658203125]\n"
]
}
],
"source": [
"from safetensors import safe_open\n",
"\n",
"def inspect_safetensors(file_path, key_filter='model.layers.0', preview_count=5):\n",
"    \"\"\"Print name, shape, dtype, element count and a short value preview for\n",
"    every tensor in a .safetensors file whose name contains `key_filter`.\n",
"\n",
"    Args:\n",
"        file_path: path to the .safetensors checkpoint shard to inspect.\n",
"        key_filter: substring a tensor name must contain to be shown\n",
"            (default keeps the original behavior of showing layer 0 only).\n",
"        preview_count: how many leading elements to print per tensor.\n",
"    \"\"\"\n",
"    print(f\"Inspecting file: {file_path}\")\n",
"    # Lazy, memory-mapped open: tensors are only materialized on get_tensor().\n",
"    with safe_open(file_path, framework=\"pt\", device=\"cpu\") as f:\n",
"        print(\"Available keys (tensor names):\")\n",
"        for key in f.keys():\n",
"            if key_filter in key:\n",
"                tensor = f.get_tensor(key)\n",
"                print(f\"\\nKey: {key}\")\n",
"                print(f\" Shape: {tensor.shape}\")\n",
"                print(f\" Dtype: {tensor.dtype}\")\n",
"                print(f\" Size: {tensor.numel()} elements\")\n",
"                # Optional: show the first few elements as a quick sanity check\n",
"                print(f\" First few elements: {tensor.flatten()[:preview_count].tolist()}\")\n",
"\n",
"# Example path; replace with the path to your own .safetensors file\n",
"file_path = \"model-00001-of-00004.safetensors\"\n",
"inspect_safetensors(file_path)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2a331ae6-f4ca-4d16-8bbf-36f39b9ac43e",
"metadata": {},
"outputs": [
{
"ename": "Exception",
"evalue": "data did not match any variant of untagged enum ModelWrapper at line 757455 column 3",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mException\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# model setting\u001b[39;00m\n\u001b[1;32m 6\u001b[0m model_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m./\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 8\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 9\u001b[0m model \u001b[38;5;241m=\u001b[39m VideoChatFlashQwenForCausalLM\u001b[38;5;241m.\u001b[39mfrom_pretrained(model_path)\u001b[38;5;241m.\u001b[39mto(torch\u001b[38;5;241m.\u001b[39mbfloat16)\u001b[38;5;241m.\u001b[39mcuda()\n\u001b[1;32m 10\u001b[0m image_processor \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mget_vision_tower()\u001b[38;5;241m.\u001b[39mimage_processor\n",
"File \u001b[0;32m/opt/conda/envs/vcflash/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:837\u001b[0m, in \u001b[0;36mAutoTokenizer.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 833\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m tokenizer_class \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 834\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 835\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTokenizer class \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtokenizer_class_candidate\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not exist or is not currently imported.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 836\u001b[0m )\n\u001b[0;32m--> 837\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtokenizer_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 839\u001b[0m \u001b[38;5;66;03m# Otherwise we have to be creative.\u001b[39;00m\n\u001b[1;32m 840\u001b[0m \u001b[38;5;66;03m# if model is an encoder decoder, the encoder tokenizer class is used by default\u001b[39;00m\n\u001b[1;32m 841\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(config, EncoderDecoderConfig):\n",
"File \u001b[0;32m/opt/conda/envs/vcflash/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2086\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 2083\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2084\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mloading file \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m from cache at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresolved_vocab_files[file_id]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 2086\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_from_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2087\u001b[0m \u001b[43m \u001b[49m\u001b[43mresolved_vocab_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2088\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2089\u001b[0m \u001b[43m \u001b[49m\u001b[43minit_configuration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2090\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_inputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2091\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2092\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2093\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2094\u001b[0m 
\u001b[43m \u001b[49m\u001b[43m_commit_hash\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcommit_hash\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2095\u001b[0m \u001b[43m \u001b[49m\u001b[43m_is_local\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_local\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2096\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrust_remote_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2097\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2098\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/conda/envs/vcflash/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2325\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase._from_pretrained\u001b[0;34m(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 2323\u001b[0m \u001b[38;5;66;03m# Instantiate the tokenizer.\u001b[39;00m\n\u001b[1;32m 2324\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 2325\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minit_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2326\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[1;32m 2327\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m(\n\u001b[1;32m 2328\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to load vocabulary from file. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2329\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease check that the provided vocabulary is accessible and not corrupted.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2330\u001b[0m )\n",
"File \u001b[0;32m/opt/conda/envs/vcflash/lib/python3.10/site-packages/transformers/models/qwen2/tokenization_qwen2_fast.py:129\u001b[0m, in \u001b[0;36mQwen2TokenizerFast.__init__\u001b[0;34m(self, vocab_file, merges_file, tokenizer_file, unk_token, bos_token, eos_token, pad_token, **kwargs)\u001b[0m\n\u001b[1;32m 118\u001b[0m unk_token \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 119\u001b[0m AddedToken(unk_token, lstrip\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, rstrip\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, special\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, normalized\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 120\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(unk_token, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m unk_token\n\u001b[1;32m 122\u001b[0m )\n\u001b[1;32m 123\u001b[0m pad_token \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 124\u001b[0m AddedToken(pad_token, lstrip\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, rstrip\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, special\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, normalized\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(pad_token, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 126\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m pad_token\n\u001b[1;32m 127\u001b[0m )\n\u001b[0;32m--> 129\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 130\u001b[0m \u001b[43m \u001b[49m\u001b[43mvocab_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 131\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mmerges_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 132\u001b[0m \u001b[43m \u001b[49m\u001b[43mtokenizer_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtokenizer_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 133\u001b[0m \u001b[43m \u001b[49m\u001b[43munk_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43munk_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 134\u001b[0m \u001b[43m \u001b[49m\u001b[43mbos_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbos_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 135\u001b[0m \u001b[43m \u001b[49m\u001b[43meos_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meos_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 136\u001b[0m \u001b[43m \u001b[49m\u001b[43mpad_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpad_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 137\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/conda/envs/vcflash/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py:111\u001b[0m, in \u001b[0;36mPreTrainedTokenizerFast.__init__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 108\u001b[0m fast_tokenizer \u001b[38;5;241m=\u001b[39m copy\u001b[38;5;241m.\u001b[39mdeepcopy(tokenizer_object)\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m fast_tokenizer_file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m from_slow:\n\u001b[1;32m 110\u001b[0m \u001b[38;5;66;03m# We have a serialization from tokenizers which let us directly build the backend\u001b[39;00m\n\u001b[0;32m--> 111\u001b[0m fast_tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mTokenizerFast\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfast_tokenizer_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m slow_tokenizer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 113\u001b[0m \u001b[38;5;66;03m# We need to convert a slow tokenizer to build the backend\u001b[39;00m\n\u001b[1;32m 114\u001b[0m fast_tokenizer \u001b[38;5;241m=\u001b[39m convert_slow_tokenizer(slow_tokenizer)\n",
"\u001b[0;31mException\u001b[0m: data did not match any variant of untagged enum ModelWrapper at line 757455 column 3"
]
}
],
"source": [
"from transformers import AutoModel, AutoTokenizer\n",
"import torch\n",
"from modeling_videochat_flash import VideoChatFlashQwenForCausalLM\n",
"\n",
"# Model setting: directory containing checkpoint shards, tokenizer and remote code\n",
"model_path = './'\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
"model = VideoChatFlashQwenForCausalLM.from_pretrained(model_path).to(torch.bfloat16).cuda()\n",
"image_processor = model.get_vision_tower().image_processor\n",
"\n",
"mm_llm_compress = False  # use the global compress or not\n",
"if mm_llm_compress:\n",
"    model.config.mm_llm_compress = True\n",
"    model.config.llm_compress_type = \"uniform0_attention\"\n",
"    model.config.llm_compress_layer_list = [4, 18]\n",
"    model.config.llm_image_token_ratio_list = [1, 0.75, 0.25]\n",
"else:\n",
"    model.config.mm_llm_compress = False\n",
"\n",
"# Evaluation setting. Greedy decoding: with do_sample=False the sampling\n",
"# parameters (temperature, top_p) are ignored by generate() and only raise\n",
"# warnings, so they are omitted here.\n",
"max_num_frames = 512\n",
"generation_config = dict(\n",
"    do_sample=False,\n",
"    max_new_tokens=1024,\n",
"    num_beams=1\n",
")\n",
"\n",
"video_path = \"test.mp4\"\n",
"\n",
"# single-turn conversation\n",
"question1 = \"Describe this video in detail.\"\n",
"output1, chat_history = model.chat(video_path=video_path, tokenizer=tokenizer, user_prompt=question1, return_history=True, max_num_frames=max_num_frames, generation_config=generation_config)\n",
"\n",
"print(output1)\n",
"\n",
"# # multi-turn conversation\n",
"# question2 = \"How many people appear in the video?\"\n",
"# output2, chat_history = model.chat(video_path=video_path, tokenizer=tokenizer, user_prompt=question2, chat_history=chat_history, return_history=True, max_num_frames=max_num_frames, generation_config=generation_config)\n",
"\n",
"# print(output2)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8bfd7429-5d77-42ad-8e3f-9cf37b25dfc7",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "vcflash",
"language": "python",
"name": "vcflash"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|