在 Amazon Bedrock 上使用 Cohere Rerank 重新排序搜索结果
本教程将向您展示如何使用托管在 Amazon Bedrock 上的 Cohere Rerank 模型在 Amazon OpenSearch Service 和自管理 OpenSearch 中实现搜索结果重新排序。
一个重新排序管道可以重新排序搜索结果,为搜索结果中的每个文档提供相对于搜索查询的相关性分数。相关性分数由交叉编码器模型计算。
将以 `your_` 为前缀的占位符替换为您自己的值。
先决条件:在 Amazon Bedrock 上测试模型
在使用模型之前,请使用以下代码在 Amazon Bedrock 上测试它
import json
import boto3
# Region hosting the Bedrock model; replace the placeholder with your own value.
bedrock_region = "your_bedrock_model_region_like_us-west-2"
bedrock_runtime_client = boto3.client(
    "bedrock-runtime", region_name=bedrock_region
)

# Invocation settings passed to invoke_model below.
modelId = "cohere.rerank-v3-5:0"
contentType = "application/json"
accept = "*/*"

# The query and the candidate passages that are sent to the model together.
query = "What is the capital city of America?"
documents = [
    "Carson City is the capital city of the American state of Nevada.",
    "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
    "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
    "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states.",
]
request_payload = {"query": query, "documents": documents, "api_version": 2}
body = json.dumps(request_payload)

response = bedrock_runtime_client.invoke_model(
    modelId=modelId, contentType=contentType, accept=accept, body=body
)

# The response body is read and parsed as JSON; the entries are under "results".
raw_body = response.get("body").read()
results = json.loads(raw_body)["results"]
print(json.dumps(results, indent=2))
响应包含按相关性分数排序的重新排序结果
[
{
"index": 2,
"relevance_score": 0.7190094
},
{
"index": 0,
"relevance_score": 0.32418242
},
{
"index": 1,
"relevance_score": 0.07456104
},
{
"index": 3,
"relevance_score": 0.06124987
}
]
要按索引对结果进行排序,请使用以下代码:
# Order the entries by their "index" field before printing them.
results_by_index = sorted(results, key=lambda entry: entry["index"])
print(json.dumps(results_by_index, indent=2))
排序后的结果如下
[
{
"index": 0,
"relevance_score": 0.32418242
},
{
"index": 1,
"relevance_score": 0.07456104
},
{
"index": 2,
"relevance_score": 0.7190094
},
{
"index": 3,
"relevance_score": 0.06124987
}
]
步骤 1:创建连接器并注册模型
要为模型创建连接器,请发送以下请求。
如果您使用自管理的 OpenSearch,请提供您的 AWS 凭证:
POST /_plugins/_ml/connectors/_create
{
"name": "Amazon Bedrock Cohere rerank model",
"description": "Test connector for Amazon Bedrock Cohere rerank model",
"version": 1,
"protocol": "aws_sigv4",
"credential": {
"access_key": "your_access_key",
"secret_key": "your_secret_key",
"session_token": "your_session_token"
},
"parameters": {
"service_name": "bedrock",
"endpoint": "bedrock-runtime",
"region": "your_bedrock_model_region_like_us-west-2",
"model_name": "cohere.rerank-v3-5:0",
"api_version": 2
},
"actions": [
{
"action_type": "PREDICT",
"method": "POST",
"url": "https://${parameters.endpoint}.${parameters.region}.amazonaws.com/model/${parameters.model_name}/invoke",
"headers": {
"x-amz-content-sha256": "required",
"content-type": "application/json"
},
"pre_process_function": """
def query_text = params.query_text;
def text_docs = params.text_docs;
def textDocsBuilder = new StringBuilder('[');
for (int i=0; i<text_docs.length; i++) {
textDocsBuilder.append('"');
textDocsBuilder.append(text_docs[i]);
textDocsBuilder.append('"');
if (i<text_docs.length - 1) {
textDocsBuilder.append(',');
}
}
textDocsBuilder.append(']');
def parameters = '{ "query": "' + query_text + '", "documents": ' + textDocsBuilder.toString() + ' }';
return '{"parameters": ' + parameters + '}';
""",
"request_body": """
{
"documents": ${parameters.documents},
"query": "${parameters.query}",
"api_version": ${parameters.api_version}
}
""",
"post_process_function": """
if (params.results == null || params.results.length == 0) {
throw new IllegalArgumentException("Post process function input is empty.");
}
def outputs = params.results;
def relevance_scores = new Double[outputs.length];
for (int i=0; i<outputs.length; i++) {
def index = new BigDecimal(outputs[i].index.toString()).intValue();
relevance_scores[index] = outputs[i].relevance_score;
}
def resultBuilder = new StringBuilder('[');
for (int i=0; i<relevance_scores.length; i++) {
resultBuilder.append(' {"name": "similarity", "data_type": "FLOAT32", "shape": [1],');
resultBuilder.append('"data": [');
resultBuilder.append(relevance_scores[i]);
resultBuilder.append(']}');
if (i<outputs.length - 1) {
resultBuilder.append(',');
}
}
resultBuilder.append(']');
return resultBuilder.toString();
"""
}
]
}
如果您使用的是 Amazon OpenSearch Service,您可以提供一个允许访问 Amazon Bedrock 的 AWS Identity and Access Management (IAM) 角色 Amazon Resource Name (ARN)
POST /_plugins/_ml/connectors/_create
{
"name": "Amazon Bedrock Cohere rerank model",
"description": "Test connector for Amazon Bedrock Cohere rerank model",
"version": 1,
"protocol": "aws_sigv4",
"credential": {
"roleArn": "your_role_arn_which_allows_access_to_bedrock_model"
},
"parameters": {
"service_name": "bedrock",
"endpoint": "bedrock-runtime",
"region": "your_bedrock_model_region_like_us-west-2",
"model_name": "cohere.rerank-v3-5:0",
"api_version": 2
},
"actions": [
{
"action_type": "PREDICT",
"method": "POST",
"url": "https://${parameters.endpoint}.${parameters.region}.amazonaws.com/model/${parameters.model_name}/invoke",
"headers": {
"x-amz-content-sha256": "required",
"content-type": "application/json"
},
"pre_process_function": """
def query_text = params.query_text;
def text_docs = params.text_docs;
def textDocsBuilder = new StringBuilder('[');
for (int i=0; i<text_docs.length; i++) {
textDocsBuilder.append('"');
textDocsBuilder.append(text_docs[i]);
textDocsBuilder.append('"');
if (i<text_docs.length - 1) {
textDocsBuilder.append(',');
}
}
textDocsBuilder.append(']');
def parameters = '{ "query": "' + query_text + '", "documents": ' + textDocsBuilder.toString() + ' }';
return '{"parameters": ' + parameters + '}';
""",
"request_body": """
{
"documents": ${parameters.documents},
"query": "${parameters.query}",
"api_version": ${parameters.api_version}
}
""",
"post_process_function": """
if (params.results == null || params.results.length == 0) {
throw new IllegalArgumentException("Post process function input is empty.");
}
def outputs = params.results;
def relevance_scores = new Double[outputs.length];
for (int i=0; i<outputs.length; i++) {
def index = new BigDecimal(outputs[i].index.toString()).intValue();
relevance_scores[index] = outputs[i].relevance_score;
}
def resultBuilder = new StringBuilder('[');
for (int i=0; i<relevance_scores.length; i++) {
resultBuilder.append(' {"name": "similarity", "data_type": "FLOAT32", "shape": [1],');
resultBuilder.append('"data": [');
resultBuilder.append(relevance_scores[i]);
resultBuilder.append(']}');
if (i<outputs.length - 1) {
resultBuilder.append(',');
}
}
resultBuilder.append(']');
return resultBuilder.toString();
"""
}
]
}
有关更多信息,请参阅 AWS 文档。
使用响应中的连接器 ID 注册和部署模型:
POST /_plugins/_ml/models/_register?deploy=true
{
"name": "Amazon Bedrock Cohere rerank model",
"function_name": "remote",
"description": "test rerank model",
"connector_id": "your_connector_id"
}
记下响应中的模型 ID;您将在后续步骤中使用它。
使用 Predict API 测试模型:
POST _plugins/_ml/models/your_model_id/_predict
{
"parameters": {
"query": "What is the capital city of America?",
"documents": [
"Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."
]
}
}
或者,您可以按如下方式测试模型
POST _plugins/_ml/_predict/text_similarity/your_model_id
{
"query_text": "What is the capital city of America?",
"text_docs": [
"Carson City is the capital city of the American state of Nevada.",
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan.",
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."
]
}
连接器的 `pre_process_function` 将输入转换为之前所示参数所需的格式。
默认情况下,Amazon Bedrock Rerank API 输出具有以下格式
[
{
"index": 2,
"relevance_score": 0.7190094
},
{
"index": 0,
"relevance_score": 0.32418242
},
{
"index": 1,
"relevance_score": 0.07456104
},
{
"index": 3,
"relevance_score": 0.06124987
}
]
连接器的 `post_process_function` 将模型的输出转换为重新排序处理器可以解释的格式,并按索引顺序排列结果。这种适配的格式如下:
{
"inference_results": [
{
"output": [
{
"name": "similarity",
"data_type": "FLOAT32",
"shape": [
1
],
"data": [
0.32418242
]
},
{
"name": "similarity",
"data_type": "FLOAT32",
"shape": [
1
],
"data": [
0.07456104
]
},
{
"name": "similarity",
"data_type": "FLOAT32",
"shape": [
1
],
"data": [
0.7190094
]
},
{
"name": "similarity",
"data_type": "FLOAT32",
"shape": [
1
],
"data": [
0.06124987
]
}
],
"status_code": 200
}
]
}
响应包含四个 `similarity` 对象。对于每个 `similarity` 对象,`data` 数组包含每个文档相对于查询的相关性分数。`similarity` 对象按照输入文档的顺序提供——第一个对象对应第一个文档。这与 Cohere Rerank 模型的默认输出不同,后者按相关性分数对文档进行排序。文档顺序在连接器的 `post_process_function` 中进行了更改,以便输出与重新排序管道兼容。
步骤 2:配置重新排序管道
请按照以下步骤配置重新排序管道。
步骤 2.1:摄取测试数据
发送批量请求以摄取测试数据
POST _bulk
{ "index": { "_index": "my-test-data" } }
{ "passage_text" : "Carson City is the capital city of the American state of Nevada." }
{ "index": { "_index": "my-test-data" } }
{ "passage_text" : "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan." }
{ "index": { "_index": "my-test-data" } }
{ "passage_text" : "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district." }
{ "index": { "_index": "my-test-data" } }
{ "passage_text" : "Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states." }
步骤 2.2:创建重新排序管道
使用 Cohere Rerank 模型创建重新排序管道
PUT /_search/pipeline/rerank_pipeline_bedrock
{
"description": "Pipeline for reranking with Bedrock Cohere rerank model",
"response_processors": [
{
"rerank": {
"ml_opensearch": {
"model_id": "your_model_id_created_in_step1"
},
"context": {
"document_fields": ["passage_text"]
}
}
}
]
}
如果您在 `document_fields` 中提供多个字段名称,则所有字段的值将首先连接,然后执行重新排序。
步骤 2.3:测试重新排序
要限制返回结果的数量,可以指定 `size` 参数。例如,设置 `"size": 2` 以返回前两个文档。
首先,在不使用重新排序管道的情况下测试查询:
POST my-test-data/_search
{
"query": {
"match": {
"passage_text": "What is the capital city of America?"
}
},
"highlight": {
"pre_tags": ["<strong>"],
"post_tags": ["</strong>"],
"fields": {"passage_text": {}}
},
"_source": false,
"fields": ["passage_text"]
}
响应中的第一个文档是 `Carson City is the capital city of the American state of Nevada`,这是不正确的:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4,
"relation": "eq"
},
"max_score": 2.5045562,
"hits": [
{
"_index": "my-test-data",
"_id": "1",
"_score": 2.5045562,
"fields": {
"passage_text": [
"Carson City is the capital city of the American state of Nevada."
]
},
"highlight": {
"passage_text": [
"Carson <strong>City</strong> <strong>is</strong> <strong>the</strong> <strong>capital</strong> <strong>city</strong> <strong>of</strong> <strong>the</strong> American state <strong>of</strong> Nevada."
]
}
},
{
"_index": "my-test-data",
"_id": "2",
"_score": 0.5807494,
"fields": {
"passage_text": [
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan."
]
},
"highlight": {
"passage_text": [
"<strong>The</strong> Commonwealth <strong>of</strong> <strong>the</strong> Northern Mariana Islands <strong>is</strong> a group <strong>of</strong> islands in <strong>the</strong> Pacific Ocean.",
"Its <strong>capital</strong> <strong>is</strong> Saipan."
]
}
},
{
"_index": "my-test-data",
"_id": "3",
"_score": 0.5261191,
"fields": {
"passage_text": [
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district."
]
},
"highlight": {
"passage_text": [
"(also known as simply Washington or D.C., and officially as <strong>the</strong> District <strong>of</strong> Columbia) <strong>is</strong> <strong>the</strong> <strong>capital</strong>",
"<strong>of</strong> <strong>the</strong> United States.",
"It <strong>is</strong> a federal district."
]
}
},
{
"_index": "my-test-data",
"_id": "4",
"_score": 0.5083029,
"fields": {
"passage_text": [
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."
]
},
"highlight": {
"passage_text": [
"<strong>Capital</strong> punishment (<strong>the</strong> death penalty) has existed in <strong>the</strong> United States since beforethe United States",
"As <strong>of</strong> 2017, <strong>capital</strong> punishment <strong>is</strong> legal in 30 <strong>of</strong> <strong>the</strong> 50 states."
]
}
}
]
}
}
接下来,使用重新排序管道测试查询:
POST my-test-data/_search?search_pipeline=rerank_pipeline_bedrock
{
"query": {
"match": {
"passage_text": "What is the capital city of America?"
}
},
"ext": {
"rerank": {
"query_context": {
"query_text": "What is the capital city of America?"
}
}
},
"highlight": {
"pre_tags": ["<strong>"],
"post_tags": ["</strong>"],
"fields": {"passage_text": {}}
},
"_source": false,
"fields": ["passage_text"]
}
响应中的第一个文档是 "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district.",这是正确的:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 4,
"relation": "eq"
},
"max_score": 0.7190094,
"hits": [
{
"_index": "my-test-data",
"_id": "3",
"_score": 0.7190094,
"fields": {
"passage_text": [
"Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district."
]
},
"highlight": {
"passage_text": [
"(also known as simply Washington or D.C., and officially as <strong>the</strong> District <strong>of</strong> Columbia) <strong>is</strong> <strong>the</strong> <strong>capital</strong>",
"<strong>of</strong> <strong>the</strong> United States.",
"It <strong>is</strong> a federal district."
]
}
},
{
"_index": "my-test-data",
"_id": "1",
"_score": 0.32418242,
"fields": {
"passage_text": [
"Carson City is the capital city of the American state of Nevada."
]
},
"highlight": {
"passage_text": [
"Carson <strong>City</strong> <strong>is</strong> <strong>the</strong> <strong>capital</strong> <strong>city</strong> <strong>of</strong> <strong>the</strong> American state <strong>of</strong> Nevada."
]
}
},
{
"_index": "my-test-data",
"_id": "2",
"_score": 0.07456104,
"fields": {
"passage_text": [
"The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean. Its capital is Saipan."
]
},
"highlight": {
"passage_text": [
"<strong>The</strong> Commonwealth <strong>of</strong> <strong>the</strong> Northern Mariana Islands <strong>is</strong> a group <strong>of</strong> islands in <strong>the</strong> Pacific Ocean.",
"Its <strong>capital</strong> <strong>is</strong> Saipan."
]
}
},
{
"_index": "my-test-data",
"_id": "4",
"_score": 0.06124987,
"fields": {
"passage_text": [
"Capital punishment (the death penalty) has existed in the United States since beforethe United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states."
]
},
"highlight": {
"passage_text": [
"<strong>Capital</strong> punishment (<strong>the</strong> death penalty) has existed in <strong>the</strong> United States since beforethe United States",
"As <strong>of</strong> 2017, <strong>capital</strong> punishment <strong>is</strong> legal in 30 <strong>of</strong> <strong>the</strong> 50 states."
]
}
}
]
},
"profile": {
"shards": []
}
}
为避免两次编写查询,请使用 `query_text_path` 而非 `query_text`,如下所示:
POST my-test-data/_search?search_pipeline=rerank_pipeline_bedrock
{
"query": {
"match": {
"passage_text": "What is the capital city of America?"
}
},
"ext": {
"rerank": {
"query_context": {
"query_text_path": "query.match.passage_text.query"
}
}
},
"highlight": {
"pre_tags": ["<strong>"],
"post_tags": ["</strong>"],
"fields": {"passage_text": {}}
},
"_source": false,
"fields": ["passage_text"]
}