Update README.md
Browse files
README.md
CHANGED
|
@@ -17,8 +17,33 @@ The output will be a JSON object.
|
|
| 17 |
|
| 18 |
```python
|
| 19 |
import json
|
|
|
|
| 20 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
# Load tokenizer and model
|
| 23 |
tokenizer = AutoTokenizer.from_pretrained('fineinstructions/template_instantiator', revision=None)
|
| 24 |
tokenizer.padding_side = 'left'
|
|
@@ -33,9 +58,15 @@ inputs = [json.dumps({
|
|
| 33 |
prompts = [tokenizer.apply_chat_template([{'role': 'user', 'content': i}], tokenize=False, add_generation_prompt=True) for i in inputs]
|
| 34 |
generations = pipe(prompts, max_length=131072, truncation=True, temperature=None, top_p=None, do_sample=False)
|
| 35 |
output = generations[0][0]['generated_text']
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
##### Output:
|
| 39 |
# {
|
| 40 |
# ..
|
| 41 |
# }
|
|
|
|
| 17 |
|
| 18 |
```python
|
| 19 |
import json
|
| 20 |
+
import re
|
| 21 |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
| 22 |
|
| 23 |
+
# Helper to expand excerpts in the answer
|
| 24 |
+
def expand(document, text):
|
| 25 |
+
excerpt_pattern = r"<excerpt>(.*?)<\.\.\.>(.*?)</excerpt>"
|
| 26 |
+
matches = re.findall(excerpt_pattern, text, flags=re.DOTALL)
|
| 27 |
+
replacements = {}
|
| 28 |
+
for prefix, suffix in matches:
|
| 29 |
+
match = re.search(
|
| 30 |
+
re.escape(prefix) + r" (.*?) " + re.escape(suffix),
|
| 31 |
+
document,
|
| 32 |
+
flags=re.DOTALL,
|
| 33 |
+
)
|
| 34 |
+
try:
|
| 35 |
+
if match:
|
| 36 |
+
replacements[f"<excerpt>{prefix}<...>{suffix}</excerpt>"] = match.group(
|
| 37 |
+
0
|
| 38 |
+
)
|
| 39 |
+
else:
|
| 40 |
+
return None
|
| 41 |
+
except Exception:
|
| 42 |
+
return None
|
| 43 |
+
for old, new in replacements.items():
|
| 44 |
+
text = text.replace(old, new)
|
| 45 |
+
return text
|
| 46 |
+
|
| 47 |
# Load tokenizer and model
|
| 48 |
tokenizer = AutoTokenizer.from_pretrained('fineinstructions/template_instantiator', revision=None)
|
| 49 |
tokenizer.padding_side = 'left'
|
|
|
|
| 58 |
prompts = [tokenizer.apply_chat_template([{'role': 'user', 'content': i}], tokenize=False, add_generation_prompt=True) for i in inputs]
|
| 59 |
generations = pipe(prompts, max_length=131072, truncation=True, temperature=None, top_p=None, do_sample=False)
|
| 60 |
output = generations[0][0]['generated_text']
|
| 61 |
+
output_json = json.loads(output)
|
| 62 |
+
|
| 63 |
+
# Expand the answer
|
| 64 |
+
output_json["answer"] = expand(document=json.loads(inputs[0])["document"], text=output_json["answer"])
|
| 65 |
+
|
| 66 |
+
# Print the output JSON
|
| 67 |
+
print(output_json)
|
| 68 |
|
| 69 |
+
##### Output JSON:
|
| 70 |
# {
|
| 71 |
# ..
|
| 72 |
# }
|