Ko-MTEB 따라잡기 (작성 중)
P
paper Lee
from util import get_model_infos
# Retrieval task names passed to test_evaluation() from the __main__ guard.
# Entries that are commented out are not part of the list, so only
# "AutoRAGRetrieval" is currently evaluated.
TASK_RETRIEVAL = [
# "Ko-StrategyQA",
"AutoRAGRetrieval",
# "XPQARetrieval",
# "BelebeleRetrieval",
# "MultiLongDocRetrieval",
# "PublicHealthQA"
]
def test_evaluation(model_name: str, tasks: list[str], batch_size: int = 3) -> None:
    """Run the given MTEB tasks for ``model_name`` and write the results to disk.

    Args:
        model_name: Model identifier. It is passed to ``get_model_infos`` and
            ``SentenceTransformer`` and is also used as the sub-folder name
            under ``results/``.
        tasks: Task names forwarded to ``get_tasks``.
        batch_size: Encoding batch size forwarded through ``encode_kwargs``.
            Defaults to 3, the value that was previously hard-coded.

    Raises:
        ValueError: If the model card reports an ``eval_library`` other than
            ``"sentence-transformers"``. Previously ``model`` stayed ``None``
            in that case and was handed to ``MTEB.run`` as-is.
    """
    model_card_info = get_model_infos(model_name)
    eval_library = model_card_info["eval_library"]
    if eval_library != "sentence-transformers":
        # Fail fast with a clear message instead of evaluating a None model.
        raise ValueError(
            f"Unsupported eval_library {eval_library!r} for model {model_name!r}; "
            "only 'sentence-transformers' is handled."
        )
    model = SentenceTransformer(model_name)

    # Several spellings of the Korean language code are passed to get_tasks;
    # presumably to match whichever form a task declares -- TODO confirm.
    evaluation = MTEB(
        tasks=get_tasks(tasks=tasks, languages=["kor-Kore", "kor-Hang", "kor_Hang"])
    )
    evaluation.run(
        model=model,
        output_folder=f"results/{model_name}",
        encode_kwargs={"batch_size": batch_size},
    )
if __name__ == "__main__":
    # Script entry point: evaluate one model on the configured retrieval tasks.
    model_name = "dragonkue/snowflake-arctic-embed-l-v2.0-ko"
    test_evaluation(model_name, TASK_RETRIEVAL)


class ColBERTWrapper(Wrapper):
    """Wrapper that loads a ``pylate`` ColBERT model and resolves its prompts."""

    def __init__(
        self,
        model_name: str,
        revision: str | None = None,
        model_prompts: dict[str, str] | None = None,
        **kwargs,
    ) -> None:
        """Load the ColBERT model and decide which prompts to use.

        Args:
            model_name: Name handed to ``pylate.models.ColBERT``.
            revision: Optional revision forwarded to ``ColBERT``.
            model_prompts: Optional prompts. When given and the loaded model
                also has prompts, these replace the model's own; when omitted,
                the model's own prompts (if any) are used.
            **kwargs: Extra keyword arguments forwarded to ``ColBERT``.
        """
        # Presumably verifies that pylate is installed before the import
        # below -- confirm against the helper's definition.
        requires_package(self, "pylate", model_name, "pip install mteb[pylate]")
        from pylate import models as colbert_model  # type: ignore[import]

        self.model_name = model_name
        self.model = colbert_model.ColBERT(self.model_name, revision=revision, **kwargs)

        shipped_prompts = getattr(self.model, "prompts", None)
        if shipped_prompts:
            if model_prompts:
                # Caller-supplied prompts take precedence over the model's.
                logger.info(f"Model.prompts will be overwritten with {model_prompts}")
                self.model.prompts = model_prompts
            else:
                model_prompts = shipped_prompts

        self.model_prompts = self.validate_task_to_prompt_name(model_prompts)
# ..... (the preceding listing is elided at this point in the original)


def ColBERT_test_Evaluation(model_name, tasks, device_number, batch_size=16):
    """Run the given MTEB tasks with a ColBERT model on a chosen CUDA device.

    Args:
        model_name: Model identifier passed to ``ColBERTWrapper``; also used
            as the sub-folder name under ``results/``.
        tasks: Task names forwarded to ``get_tasks``.
        device_number: Index used to build the ``cuda:<n>`` device string.
        batch_size: Encoding batch size forwarded through ``encode_kwargs``.
            Defaults to 16, the value that was previously hard-coded.
    """
    model = ColBERTWrapper(
        model_name=model_name,
        model_kwargs={"torch_dtype": torch.float32},  # f32
        device=torch.device(f"cuda:{device_number}"),
    )

    tokenizer = model.model[0].tokenizer
    # NOTE(review): the guard tests ``model_max_length`` although the line
    # below sets ``model_input_names``; kept as in the original -- confirm intent.
    if hasattr(tokenizer, "model_max_length"):
        # token_type_ids is not needed. The attribute was previously
        # misspelled (``model_input_nmaes``), so the assignment only created
        # an unused attribute and had no effect.
        tokenizer.model_input_names = ["input_ids", "attention_mask"]

    evaluation = MTEB(
        tasks=get_tasks(tasks=tasks, languages=["kor-Kore", "kor-Hang", "kor_Hang"])
    )
    evaluation.run(
        model=model,
        output_folder=f"results/{model_name}",
        encode_kwargs={"batch_size": batch_size},
    )