""" Starting with llmware 0.3.7, we have integrated support for ONNX Runtime Generative models.
To get started:
`pip install onnxruntime_genai`
Please note that onnxruntime_genai is supported on a wide range of Windows, Linux and x86 platforms, but
does not build for Mac Metal - so it will not work on Macs.
"""
from llmware.models import ModelCatalog
from importlib import util

if not util.find_spec("onnxruntime_genai"):
    print("\nto run this example, you need to install onnxruntime_genai first, e.g., pip3 install onnxruntime_genai")
# we will be adding more ONNX models to the default catalog, but we currently support:
# -- bling-tiny-llama-onnx
# -- bling-phi-3-onnx
# -- phi-3-onnx
# please see the example 'adding_openvino_or_onnx_model.py' to add your own ONNX and OpenVINO models
# (a hedged sketch of that registration flow follows below)
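
# The sketch below shows one way a custom ONNX model could be registered and then loaded.
# Hedged sketch: the method name and parameters used here (register_onnx_model, model_repo,
# context_window, prompt_wrapper) are assumptions - please confirm the exact signature in
# 'adding_openvino_or_onnx_model.py' for your llmware version before relying on it.

def register_custom_onnx_model_sketch():

    """ Hedged sketch - registers a hypothetical ONNX model in the catalog, then loads it. """

    # assumption: register_onnx_model adds a model card pointing at a repo with ONNX weights
    ModelCatalog().register_onnx_model(model_name="my-custom-onnx-model",           # hypothetical name
                                       model_repo="llmware/bling-tiny-llama-onnx",  # repo to pull from
                                       context_window=2048,
                                       prompt_wrapper="human_bot")

    # once registered, the model loads like any other catalog entry
    return ModelCatalog().load_model("my-custom-onnx-model")
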
def getting_started():

    """ Simple 'hello world' example. """

    model = ModelCatalog().load_model("bling-tiny-llama-onnx", temperature=0.0, sample=False,
                                      max_output=100)

    query = "What was Microsoft's revenue in the 3rd quarter?"

    context = ("Microsoft Cloud Strength Drives Third Quarter Results \nREDMOND, Wash. — April 25, 2023 — "
               "Microsoft Corp. today announced the following results for the quarter ended March 31, 2023,"
               " as compared to the corresponding period of last fiscal year:\n· Revenue was $52.9 billion"
               " and increased 7% (up 10% in constant currency)\n· Operating income was $22.4 billion "
               "and increased 10% (up 15% in constant currency)\n· Net income was $18.3 billion and "
               "increased 9% (up 14% in constant currency)\n· Diluted earnings per share was $2.45 "
               "and increased 10% (up 14% in constant currency).\n")

    # add_context passes the source text to the model along with the query
    response = model.inference(query, add_context=context)

    print(f"\ngetting_started example - query - {query}")
    print("getting_started example - response: ", response)

    return response
def streaming_example():

    """ Demonstrates streaming tokens from the model as they are generated. """

    prompt = "What are the benefits of small specialized LLMs?"

    print(f"\nstreaming_example - prompt: {prompt}")

    # model.stream returns a generator, so iterate over it to consume the tokens as they arrive
    model = ModelCatalog().load_model("phi-3-onnx", max_output=500)

    text_out = ""
    token_count = 0

    for streamed_token in model.stream(prompt):

        text_out += streamed_token

        # skip printing until the first non-whitespace token has arrived
        if text_out.strip():
            print(streamed_token, end="")

        token_count += 1

    print("\ntotal text: ", text_out)
    print("total tokens: ", token_count)

    return text_out
if __name__ == "__main__":

    getting_started()
    streaming_example()