import json
import os
import re
import requests
from transformers import AutoTokenizer
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex, SummaryIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import FunctionTool, QueryEngineTool, BaseTool
from llama_index.core.vector_stores import MetadataFilters, FilterCondition
from llama_index.llms.openai_like import OpenAILike
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import nest_asyncio
nest_asyncio.apply()

from dotenv import load_dotenv
load_dotenv()
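The code below reads BASE_URL, API_KEY, and HF_CACHE_DIR from the environment, so a .env file along these lines is assumed (all values are placeholders):

BASE_URL=http://localhost:8000
API_KEY=your-api-key
HF_CACHE_DIR=/path/to/huggingface/cache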
This is an agent capable of function calling in order to answer a user's questions about a document. It is powered by the Mixtral 8x22B MoE model.
Configure default LLM and embedding model for LlamaIndex
def get_model_name():
    BASE_URL = os.environ['BASE_URL']
    headers = {
        'accept': 'application/json',
        'x-api-key': os.environ['API_KEY']
    }
    res = requests.get(os.path.join(BASE_URL, 'model'), headers=headers).json()
    if 'id' not in res:
        raise Exception('Model not loaded.')
    return res['id']

model_name = get_model_name()
print(model_name)
llm = OpenAILike(
    model=model_name,
    api_base=os.environ['BASE_URL'],
    api_key=os.environ['API_KEY']
)

tokenizer = AutoTokenizer.from_pretrained('mistralai/Mixtral-8x22B-Instruct-v0.1')

embed_model = HuggingFaceEmbedding(
    'BAAI/bge-base-en-v1.5',
    cache_folder=os.environ['HF_CACHE_DIR']
)

Settings.llm = llm
Settings.embed_model = embed_model
Mixtral-8x22B-Instruct-v0.1-exl2-4.0bpw
Define a custom function-calling agent
def format_prompt(messages, tokenizer, use_tool=False, tools=None):
    if use_tool:
        if tools is None or len(tools) == 0:
            raise Exception('A list of tools is required for function calling.')
        prompt = tokenizer.apply_chat_template(
            messages,
            chat_template='tool_use',
            tools=json.dumps(tools),
            tokenize=False,
            add_generation_prompt=True)
    else:
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return prompt
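To see what the template produces, you can format a toy conversation and print the raw prompt (a minimal sketch; get_time is a hypothetical tool spec in the OpenAI function-calling format, and the exact markup depends on the tokenizer's bundled tool_use chat template):

# Hypothetical tool spec, purely for illustration.
toy_tools = [{
    'type': 'function',
    'function': {
        'name': 'get_time',
        'description': 'Get the current time for a city.',
        'parameters': {
            'type': 'object',
            'properties': {'city': {'type': 'string', 'description': 'City name.'}},
            'required': ['city'],
        },
    },
}]
toy_messages = [{'role': 'user', 'content': 'What time is it in Tokyo?'}]
# Prints the raw prompt string, including the serialized tool definitions.
print(format_prompt(toy_messages, tokenizer, use_tool=True, tools=toy_tools))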
class MixtralFunctionCallingAgent:
    def __init__(
        self,
        llm: OpenAILike,
        tokenizer: AutoTokenizer,
        tool_param_desc_pairs: list[tuple[BaseTool, dict]],
        initial_memory: list[dict] = []
    ) -> None:
        self._llm = llm
        self._tokenizer = tokenizer
        self._initial_memory = initial_memory.copy()
        self._memory = initial_memory.copy()
        self._tool_specs = []
        self._tools = {}
        for tool, param_desc in tool_param_desc_pairs:
            tool_spec = self._get_fn_tool_spec(tool, param_desc)
            self._tool_specs.append(tool_spec)
            self._tools[tool.metadata.name] = tool
    def reset(self) -> None:
        self._memory = self._initial_memory.copy()

    def chat(self, message: str) -> tuple[str, str]:
        self._memory.append({'role': 'user', 'content': message})
        while True:
            response, prompt = self._run_step()

            print('----------DEBUG----------')
            print(response)
            print('-------------------------\n')

            toolcall = self._get_first_toolcall(response)
            if toolcall:
                self._single_tool_call(toolcall)
            else:
                self._memory.append({'role': 'assistant', 'content': response})
                return response, prompt

    def _run_step(self):
        prompt = format_prompt(self._memory, self._tokenizer, True, self._tool_specs)
        response = self._llm.complete(prompt, formatted=True).text.strip()
        return response, prompt
    def _get_first_toolcall(self, response: str) -> dict | None:
        # Match the first serialized tool call of the form {"name": ..., "arguments": {...}}.
        m = re.findall(r'(\{\s*"name":.*?\}\})+', response)
        if len(m) > 0:
            try:
                toolcall = json.loads(m[0])
                return toolcall
            except json.JSONDecodeError:
                return None
        else:
            return None
    def _single_tool_call(self, tool_call: dict) -> None:
        self._memory.append(
            {'role': 'tool_calls', 'content': json.dumps([tool_call], ensure_ascii=False)}
        )
        tool_results = self._call_tool(tool_call)
        self._memory.append(
            {'role': 'tool_results', 'content': json.dumps({'content': tool_results}, ensure_ascii=False)}
        )

    def _call_tool(self, tool_call: dict) -> str:
        tool = self._tools[tool_call['name']]
        results = tool(**tool_call['arguments'])
        return results.content.strip()
    def _get_fn_tool_spec(self, fn_tool, tool_param_descriptions):
        fn_tool_spec = fn_tool.metadata.to_openai_tool()
        for k, v in fn_tool_spec['function']['parameters']['properties'].items():
            v.pop('title', None)
            v['description'] = tool_param_descriptions[k]
        return fn_tool_spec
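As a quick sanity check of the tool-call extraction, you can run the same regex on a hand-written response (a minimal sketch with a made-up model output):

# Hypothetical raw model output containing one serialized tool call.
sample = 'Sure. [{"name": "vector_tool_mora", "arguments": {"query": "test"}}]'
m = re.findall(r'(\{\s*"name":.*?\}\})+', sample)
print(json.loads(m[0]))  # {'name': 'vector_tool_mora', 'arguments': {'query': 'test'}}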
Create indexes for documents
documents = SimpleDirectoryReader(
    input_files=["MoRA: High-Rank Updating for Parameter-Efficient Fine-Tuning.pdf"]
).load_data()

splitter = SentenceSplitter(chunk_size=1024)
nodes = splitter.get_nodes_from_documents(documents)

vector_index = VectorStoreIndex(nodes)
summary_index = SummaryIndex(nodes)
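Rebuilding the indexes re-embeds the whole PDF on every run; if that gets slow, the vector index can be persisted and reloaded (a sketch using LlamaIndex's storage API; './mora_index' is an arbitrary path):

from llama_index.core import StorageContext, load_index_from_storage

# Persist the freshly built index to disk.
vector_index.storage_context.persist(persist_dir='./mora_index')

# Later, reload it without re-embedding the document.
storage_context = StorageContext.from_defaults(persist_dir='./mora_index')
vector_index = load_index_from_storage(storage_context)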
def vector_query(query: str, page_numbers: list[str] | None = None) -> str:
    """Useful for answering questions about the MoRA paper.

    Always leave page_numbers as None unless you want to search for a specific page.

    Args:
        query (str): the string query to be embedded.
        page_numbers (list[str] | None): Filter by set of pages. Leave as None to perform a vector search over all pages. Otherwise, filter by the set of specified pages.
    """
    page_numbers = page_numbers or []
    metadata_dicts = [
        {"key": "page_label", "value": p} for p in page_numbers
    ]
    query_engine = vector_index.as_query_engine(
        similarity_top_k=2,
        filters=MetadataFilters.from_dicts(
            metadata_dicts,
            condition=FilterCondition.OR
        )
    )
    response = query_engine.query(query)
    return response
vector_tool_param_descriptions = {
    'query': 'the string query to be embedded.',
    'page_numbers': 'Filter by set of pages. Leave as None to perform a vector search over all pages. Otherwise, filter by the set of specified pages.'
}
vector_tool = FunctionTool.from_defaults(name='vector_tool_mora', fn=vector_query)
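The tool can be exercised directly before wiring it to the agent; calling a FunctionTool returns a ToolOutput whose content holds the stringified result (the query below is just an example):

# Direct call, bypassing the agent; kwargs mirror vector_query's signature.
print(vector_tool(query='What problem does MoRA address?').content)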
def summary_query(query: str) -> str:
    """Use ONLY IF you want to get a holistic summary of MoRA. Avoid it if you have specific questions over MoRA.

    Args:
        query (str): the string query to be embedded.
    """
    summary_query_engine = summary_index.as_query_engine(
        response_mode='tree_summarize',
        use_async=True
    )
    response = summary_query_engine.query(query)
    return response.response

summary_tool_param_descriptions = {'query': 'the string query to be embedded.'}
summary_tool = FunctionTool.from_defaults(name='summary_tool_mora', fn=summary_query)
tool_param_desc_pairs = [
    (vector_tool, vector_tool_param_descriptions),
    (summary_tool, summary_tool_param_descriptions),
]
Query the document
agent = MixtralFunctionCallingAgent(
    llm=llm,
    tokenizer=tokenizer,
    tool_param_desc_pairs=tool_param_desc_pairs,
)

response, _ = agent.chat("What are the main contributions of the MoRA paper?")
print(response)
----------DEBUG----------
[{"name": "summary_tool_mora", "arguments": {"query": "What are the main contributions of the MoRA paper?"}}]
-------------------------
----------DEBUG----------
The main contributions of the MoRA paper are:
1. Introducing MoRA, a method that utilizes non-parameterized operators for high-rank updating to overcome the limitations of low-rank updating through LoRA.
2. Exploring various methods to implement decompresion and compression functions within the MoRA framework.
3. Performance comparisons indicating that MoRA matches LoRA in instruction tuning and mathematical reasoning, and exhibits superior performance in continual pretraining and memory tasks.
4. Conducting pretraining experiments to further demonstrate the effectiveness of high-rank updating and showing superior results compared to ReLoRA.
5. Proposing a novel method for parameter-efficient fine-tuning of large-scale pre-trained models, which is based on high-rank updating.
6. Introducing the concept of ReMoRA, which is a generalization of LoRA that allows for more flexible and efficient fine-tuning.
7. Providing a detailed implementation of ReMoRA in pretraining, including the definition of two kinds of grouping functions and the use of compression and decompression functions.
8. Evaluating the performance of MoRA on various downstream tasks, including biomedical and financial tasks, and showing that it outperforms existing methods such as LoRA and FFT.
9. Providing an analysis of the computational complexity of MoRA and showing that it is more efficient than existing methods in terms of both time and space complexity.
10. Providing an ablation study to demonstrate the effectiveness of each component of MoRA.
11. Providing a comparison with other parameter-efficient fine-tuning methods and showing that MoRA outperforms them in terms of both accuracy and efficiency.
12. Providing a discussion on the limitations of MoRA and future work.
-------------------------
Reset the agent’s memory for a new chat session.
agent.reset()

response, _ = agent.chat("What are the main results of MoRA described on page 7?")
print(response)
----------DEBUG----------
[{"name": "vector_tool_mora", "arguments": {"query": "What are the main results of MoRA described on page 7?", "page_numbers": ["7"]}}]
-------------------------
----------DEBUG----------
According to the results described on page 7 of MoRA, the model shows on par performances with LoRA on instruction tuning and mathematical reasoning. MoRA benefits from high-rank updating to memorize new knowledge and outperforms LoRA on both biomedical and financial domains for continual pre-training. LoRA variants exhibit similar performances on these fine-tuning tasks as compared to LoRA. Although AsyLoRA achieves the best performance in instruction tuning, it demonstrates poor performance in mathematical reasoning. For ReLoRA, merging low-rank matrices during training can harm performance, particularly at the high rank like 256. Different tasks show different requirements for fine-tuning capabilities. For instruction tuning, rank 8 is enough to achieve performance similar to FFT. For mathematical reasoning, rank 8 is unable to match FFT performance. However, increasing the rank from 8 to 256 can eliminate the performance gap. For continual pretraining, LoRA with rank 256 still underperforms FFT.
-------------------------
= agent.chat("How do MoRA compared with LoRA?")
response, _ print(response)
----------DEBUG----------
[{"name": "vector_tool_mora", "arguments": {"query": "How do MoRA compared with LoRA?"}}]
-------------------------
----------DEBUG----------
According to the results, MoRA shows on par performances with LoRA on instruction tuning and mathematical reasoning. MoRA benefits from high-rank updating to memorize new knowledge and outperforms LoRA on both biomedical and financial domains for continual pre-training. However, LoRA variants exhibit similar performances on these fine-tuning tasks as compared to LoRA. Although AsyLoRA achieves the best performance in instruction tuning, it demonstrates poor performance in mathematical reasoning. For ReLoRA, merging low-rank matrices during training can harm performance, particularly at the high rank like 256.
-------------------------
= agent.chat("What are the main points of High-rank Updating on page 8?")
response, _ print(response)
----------DEBUG----------
[{"name": "vector_tool_mora", "arguments": {"query": "What are the main points of High-rank Updating on page 8?", "page_numbers": ["8"]}}]
-------------------------
----------DEBUG----------
According to the main points of High-rank Updating on page 8, the impact of high-rank updating on the rank of ∆W was demonstrated by analyzing the spectrum of singular values for the learned ∆W on a 250M pretraining model. MoRA and ReMoRA exhibit a substantially higher number of significant singular values compared to LoRA and ReLoRA, highlighting the effectiveness of these methods in increasing the rank of ∆W. The quantity of singular values shown in Figure 5 can be correlated with the perplexity metrics listed in Table 3. MoRA, without the merge-and-reint strategy in ReLoRA and ReMoRA, can achieve a lower perplexity than ReLoRA along with a higher significant singular values.
-------------------------
Take a look at the chat history
agent._memory
[{'role': 'user',
'content': 'What are the main results of MoRA described on page 7?'},
{'role': 'tool_calls',
'content': '[{"name": "vector_tool_mora", "arguments": {"query": "What are the main results of MoRA described on page 7?", "page_numbers": ["7"]}}]'},
{'role': 'tool_results',
'content': '{"content": "MoRA shows on par performances with LoRA on instruction tuning and mathematical reasoning. Benefit from high-rank updating to memorize new knowledge, MoRA outperforms LoRA on both biomedical and financial domains for continual pre-training. LoRA variants exhibit similar performances on these fine-tuning tasks as compared to LoRA. Although AsyLoRA achieves the best performance in instruction tuning, it demonstrates poor performance in mathematical reasoning. For ReLoRA, merging low-rank matrices during training can harm performance, particularly at the high rank like 256. Different tasks show different requirements for fine-tuning capabilities. For instruction tuning, rank 8 is enough to achieve performance similar to FFT. For mathematical reasoning, rank 8 is unable to match FFT performance. However, increasing the rank from 8 to 256 can eliminate the performance gap. For continual pretraining, LoRA with rank 256 still underperforms FFT."}'},
{'role': 'assistant',
'content': 'According to the results described on page 7 of MoRA, the model shows on par performances with LoRA on instruction tuning and mathematical reasoning. MoRA benefits from high-rank updating to memorize new knowledge and outperforms LoRA on both biomedical and financial domains for continual pre-training. LoRA variants exhibit similar performances on these fine-tuning tasks as compared to LoRA. Although AsyLoRA achieves the best performance in instruction tuning, it demonstrates poor performance in mathematical reasoning. For ReLoRA, merging low-rank matrices during training can harm performance, particularly at the high rank like 256. Different tasks show different requirements for fine-tuning capabilities. For instruction tuning, rank 8 is enough to achieve performance similar to FFT. For mathematical reasoning, rank 8 is unable to match FFT performance. However, increasing the rank from 8 to 256 can eliminate the performance gap. For continual pretraining, LoRA with rank 256 still underperforms FFT.'},
{'role': 'user', 'content': 'How do MoRA compared with LoRA?'},
{'role': 'tool_calls',
'content': '[{"name": "vector_tool_mora", "arguments": {"query": "How do MoRA compared with LoRA?"}}]'},
{'role': 'tool_results',
'content': '{"content": "MoRA shows on par performances with LoRA on instruction tuning and mathematical reasoning. Benefit from high-rank updating to memorize new knowledge, MoRA outperforms LoRA on both biomedical and financial domains for continual pretraining. However, LoRA variants exhibit similar performances on these fine-tuning tasks as compared to LoRA. Although AsyLoRA achieves the best performance in instruction tuning, it demonstrates poor performance in mathematical reasoning. For ReLoRA, merging low-rank matrices during training can harm performance, particularly at the high rank like 256."}'},
{'role': 'assistant',
'content': 'According to the results, MoRA shows on par performances with LoRA on instruction tuning and mathematical reasoning. MoRA benefits from high-rank updating to memorize new knowledge and outperforms LoRA on both biomedical and financial domains for continual pre-training. However, LoRA variants exhibit similar performances on these fine-tuning tasks as compared to LoRA. Although AsyLoRA achieves the best performance in instruction tuning, it demonstrates poor performance in mathematical reasoning. For ReLoRA, merging low-rank matrices during training can harm performance, particularly at the high rank like 256.'},
{'role': 'user',
'content': 'What are the main points of High-rank Updating on page 8?'},
{'role': 'tool_calls',
'content': '[{"name": "vector_tool_mora", "arguments": {"query": "What are the main points of High-rank Updating on page 8?", "page_numbers": ["8"]}}]'},
{'role': 'tool_results',
'content': '{"content": "1. The impact of high-rank updating on the rank of ∆W was demonstrated by analyzing the spectrum of singular values for the learned ∆W on a 250M pretraining model.\\n2. MoRA and ReMoRA exhibit a substantially higher number of significant singular values compared to LoRA and ReLoRA, highlighting the effectiveness of these methods in increasing the rank of ∆W.\\n3. The quantity of singular values shown in Figure 5 can be correlated with the perplexity metrics listed in Table 3.\\n4. MoRA, without the merge-and-reint strategy in ReLoRA and ReMoRA, can achieve a lower perplexity than ReLoRA along with a higher significant singular values."}'},
{'role': 'assistant',
'content': 'According to the main points of High-rank Updating on page 8, the impact of high-rank updating on the rank of ∆W was demonstrated by analyzing the spectrum of singular values for the learned ∆W on a 250M pretraining model. MoRA and ReMoRA exhibit a substantially higher number of significant singular values compared to LoRA and ReLoRA, highlighting the effectiveness of these methods in increasing the rank of ∆W. The quantity of singular values shown in Figure 5 can be correlated with the perplexity metrics listed in Table 3. MoRA, without the merge-and-reint strategy in ReLoRA and ReMoRA, can achieve a lower perplexity than ReLoRA along with a higher significant singular values.'}]
Ask the model to summarize the conversation
= agent.chat("Summarize our chat in bullet points")
response, _ print(response)
----------DEBUG----------
1. MoRA is a model that shows on par performances with LoRA on instruction tuning and mathematical reasoning.
2. MoRA benefits from high-rank updating to memorize new knowledge and outperforms LoRA on both biomedical and financial domains for continual pre-training.
3. LoRA variants exhibit similar performances on these fine-tuning tasks as compared to LoRA.
4. Although AsyLoRA achieves the best performance in instruction tuning, it demonstrates poor performance in mathematical reasoning.
5. For ReLoRA, merging low-rank matrices during training can harm performance, particularly at the high rank like 256.
6. Different tasks show different requirements for fine-tuning capabilities. For instruction tuning, rank 8 is enough to achieve performance similar to FFT. For mathematical reasoning, rank 8 is unable to match FFT performance. However, increasing the rank from 8 to 256 can eliminate the performance gap.
7. For continual pretraining, LoRA with rank 256 still underperforms FFT.
8. The impact of high-rank updating on the rank of ∆W was demonstrated by analyzing the spectrum of singular values for the learned ∆W on a 250M pretraining model.
9. MoRA and ReMoRA exhibit a substantially higher number of significant singular values compared to LoRA and ReLoRA, highlighting the effectiveness of these methods in increasing the rank of ∆W.
10. The quantity of singular values shown in Figure 5 can be correlated with the perplexity metrics listed in Table 3.
11. MoRA, without the merge-and-reint strategy in ReLoRA and ReMoRA, can achieve a lower perplexity than ReLoRA along with a higher significant singular values.
-------------------------