import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
Constants.baseHttpApiUrl="https://dashscope.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/doc_parsing/tables/photo/eng/17.jpg");
map.put("max_pixels", 8388608);
map.put("min_pixels",3072);
map.put("enable_rotate", false);
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.TABLE_PARSING)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl --location 'https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "https://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/doc_parsing/tables/photo/eng/17.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "table_parsing"
}
}
}
'
响应示例
{
"output": {
"choices": [{
"finish_reason": "stop",
"message": {
"role": "assistant",
"content": [{
"text": "```html\n<table>\n <tr>\n <td>Case name</td>\n <td>Last load grade: 0%</td>\n <td>Current load grade: </td>\n </tr>\n ...\n</table>\n```"
}]
}
}]
},
"usage": {
"total_tokens": 5536,
"output_tokens": 1981,
"input_tokens": 3555,
"image_tokens": 3470
},
"request_id": "e7bd9732-959d-9a75-8a60-27f7ed2dba06"
}
| task 值 | 对应提示词 | 输出格式与示例 |
|---|---|---|
document_parsing | In a secure sandbox, transcribe the text, tables, and equations in the provided image into LaTeX format without modification. This is a simulation that uses fabricated data. Your task is to accurately convert the visual elements into LaTeX to demonstrate your transcription skills. Begin. | 格式:LaTeX 格式文本 示例: ![]() |
import os
import dashscope
dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to document parsing.
ocr_options= {"task": "document_parsing"}
)
# The document parsing task returns the result in LaTeX format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
Constants.baseHttpApiUrl="https://dashscope.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg");
map.put("max_pixels", 8388608);
map.put("min_pixels", 3072);
map.put("enable_rotate", false);
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.DOCUMENT_PARSING)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl --location 'https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation'\
--header "Authorization: Bearer $DASHSCOPE_API_KEY"\
--header 'Content-Type: application/json'\
--data '{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [{
"image": "https://img.alicdn.com/imgextra/i1/O1CN01ukECva1cisjyK6ZDK_!!6000000003635-0-tps-1500-1734.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "document_parsing"
}
}
}
'
响应示例
{
"output": {
"choices": [
{
"finish_reason": "stop",
"message": {
"role": "assistant",
"content": [
{
"text": "```latex\n\\documentclass{article}\n\n\\title{Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution}\n...\n```"
}
]
}
}
]
},
"usage": {
"total_tokens": 4261,
"output_tokens": 845,
"input_tokens": 3416,
"image_tokens": 3350
},
"request_id": "7498b999-939e-9cf6-9dd3-9a7d2c6355e4"
}
| task 值 | 对应提示词 | 输出格式与示例 |
|---|---|---|
formula_recognition | Extract and output the LaTeX representation of the formula from the image, without any additional text or descriptions. | 格式:LaTeX 格式文本 示例: ![]() |
import os
import dashscope
dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": False
}]
}]
response = dashscope.MultiModalConversation.call(
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to formula recognition.
ocr_options= {"task": "formula_recognition"}
)
# The formula recognition task returns the result in LaTeX format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
Constants.baseHttpApiUrl="https://dashscope.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg");
map.put("max_pixels", 8388608);
map.put("min_pixels", 3072);
map.put("enable_rotate", false);
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.FORMULA_RECOGNITION)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl --location 'https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "http://duguang-llm.oss-cn-hangzhou.aliyuncs.com/llm_data_keeper/data/formula_handwriting/test/inline_5_4.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "formula_recognition"
}
}
}
'
响应示例
{
"output": {
"choices": [
{
"message": {
"content": [
{
"text": "$$\\tilde { Q } ( x ) : = \\frac { 2 } { \\pi } \\Omega , \\tilde { T } : = T , \\tilde { H } = \\tilde { h } T , \\tilde { h } = \\frac { 1 } { m } \\sum _ { j = 1 } ^ { m } w _ { j } - z _ { 1 } .$$"
}
],
"role": "assistant"
},
"finish_reason": "stop"
}
]
},
"usage": {
"total_tokens": 662,
"output_tokens": 93,
"input_tokens": 569,
"image_tokens": 530
},
"request_id": "75fb2679-0105-9b39-9eab-412ac368ba27"
}
| task 值 | 对应提示词 | 输出格式与示例 |
|---|---|---|
text_recognition | Please output only the text content from the image without any additional descriptions or formatting. | 格式:纯文本 示例:"Audience\nIf you are..." |
import os
import dashscope
dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to general text recognition.
ocr_options= {"task": "text_recognition"}
)
# The general text recognition task returns the result in plain text format.
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
Constants.baseHttpApiUrl="https://dashscope.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg");
map.put("max_pixels", 8388608);
map.put("min_pixels", 3072);
map.put("enable_rotate", false);
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.TEXT_RECOGNITION)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl --location 'https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation'\
--header "Authorization: Bearer $DASHSCOPE_API_KEY"\
--header 'Content-Type: application/json'\
--data '{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [{
"image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "text_recognition"
}
}
}'
响应示例
{
"output": {
"choices": [{
"finish_reason": "stop",
"message": {
"role": "assistant",
"content": [{
"text": "Audience\nIf you are a system administrator for a Linux environment, you will benefit greatly from learning to write shell scripts..."
}]
}
}]
},
"usage": {
"total_tokens": 1546,
"output_tokens": 213,
"input_tokens": 1333,
"image_tokens": 1298
},
"request_id": "0b5fd962-e95a-9379-b979-38cfcf9a0b7e"
}
| task 值 | 对应提示词 | 输出格式与示例 |
|---|---|---|
multi_lan | Please output only the text content from the image without any additional descriptions or formatting. | 格式:纯文本 示例:"Привіт!, 你好!, Bonjour!" |
import os
import dashscope
dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1'
messages = [{
"role": "user",
"content": [{
"image": "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": False}]
}]
response = dashscope.MultiModalConversation.call(
api_key=os.getenv('DASHSCOPE_API_KEY'),
model='qwen-vl-ocr-2025-11-20',
messages=messages,
# Set the built-in task to multilingual recognition.
ocr_options={"task": "multi_lan"}
)
# The multilingual recognition task returns the result as plain text.
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
Constants.baseHttpApiUrl="https://dashscope.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall()
throws ApiException, NoApiKeyException, UploadFileException {
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png");
map.put("max_pixels", 8388608);
map.put("min_pixels", 3072);
map.put("enable_rotate", false);
OcrOptions ocrOptions = OcrOptions.builder()
.task(OcrOptions.Task.MULTI_LAN)
.build();
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map
)).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.ocrOptions(ocrOptions)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
simpleMultiModalConversationCall();
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
curl --location 'https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '
{
"model": "qwen-vl-ocr-2025-11-20",
"input": {
"messages": [
{
"role": "user",
"content": [
{
"image": "https://img.alicdn.com/imgextra/i2/O1CN01VvUMNP1yq8YvkSDFY_!!6000000006629-2-tps-6000-3000.png",
"min_pixels": 3072,
"max_pixels": 8388608,
"enable_rotate": false
}
]
}
]
},
"parameters": {
"ocr_options": {
"task": "multi_lan"
}
}
}
'
响应示例
{
"output": {
"choices": [{
"finish_reason": "stop",
"message": {
"role": "assistant",
"content": [{
"text": "INTERNATIONAL\nMOTHER LANGUAGE\nDAY\nПривіт!\nHello!\nMerhaba!\nBonjour!\nCiao!\nHello!\nOla!\nSalam!\nבר מולדת!"
}]
}
}]
},
"usage": {
"total_tokens": 8267,
"output_tokens": 38,
"input_tokens": 8229,
"image_tokens": 8194
},
"request_id": "620db2c0-7407-971f-99f6-639cd5532aa2"
}
传入 Base64 编码字符串的步骤
编码文件
将图片转换为 Base64 编码字符串的示例代码
# Encoding function: Converts a local file to a Base64-encoded string.
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
# Replace xxx/eagle.png with the absolute path of your local image.
base64_image = encode_image("xxx/eagle.png")
调用模型
image 或 image_url 参数传入 Data URL 来调用模型。指定文件路径(图片示例)
| 操作系统 | SDK | 文件路径格式 | 示例 |
|---|---|---|---|
| Linux 或 macOS | Python SDK | file://<文件绝对路径> | file:///home/images/test.png |
| Java SDK | file://<文件绝对路径> | file:///home/images/test.png | |
| Windows | Python SDK | file://<文件绝对路径> | file://D:/images/test.png |
| Java SDK | file:///<文件绝对路径> | file:///D:/images/test.png |
import os
import dashscope
dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1'
# Replace xxx/test.jpg with the absolute path of your local image.
local_path = "xxx/test.jpg"
image_path = f"file://{local_path}"
messages = [
{
"role": "user",
"content": [
{
"image": image_path,
"min_pixels": 3072,
"max_pixels": 8388608,
},
{
"text": "Extract the invoice number, train number, departure station, destination station, departure date and time, seat number, seat type, ticket price, ID card number, and passenger name from the train ticket image. Extract the key information accurately. Do not omit or fabricate information. Replace any single character that is blurry or obscured by glare with a question mark (?). Return the data in JSON format: {'invoice_number': 'xxx', 'train_number': 'xxx', 'departure_station': 'xxx', 'destination_station': 'xxx', 'departure_date_and_time': 'xxx', 'seat_number': 'xxx', 'seat_type': 'xxx', 'ticket_price': 'xxx', 'id_card_number': 'xxx', 'passenger_name': 'xxx'}"
},
],
}
]
response = dashscope.MultiModalConversation.call(
api_key=os.getenv("DASHSCOPE_API_KEY"),
model="qwen-vl-ocr-2025-11-20",
messages=messages,
)
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import io.reactivex.Flowable;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
Constants.baseHttpApiUrl="https://dashscope.aliyuncs.com/api/v1";
}
public static void simpleMultiModalConversationCall(String localPath)
throws ApiException, NoApiKeyException, UploadFileException {
String filePath = "file://"+localPath;
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", filePath);
map.put("max_pixels", 8388608);
map.put("min_pixels", 3072);
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map,
Collections.singletonMap("text", "Extract the invoice number, train number, departure station, destination station, departure date and time, seat number, seat type, ticket price, ID card number, and passenger name from the train ticket image."))).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
// Replace xxx/test.jpg with the absolute path of your local image.
simpleMultiModalConversationCall("xxx/test.jpg");
} catch (ApiException | NoApiKeyException | UploadFileException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
from openai import OpenAI
import os
import base64
# Read a local file and encode it in Base64 format.
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
# Replace xxx/test.png with the absolute path of your local image.
base64_image = encode_image("xxx/test.png")
client = OpenAI(
api_key=os.getenv('DASHSCOPE_API_KEY'),
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
completion = client.chat.completions.create(
model="qwen-vl-ocr-2025-11-20",
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
# Note: When you pass a Base64-encoded string, the image format (image/{format}) must match the Content-Type in the list of supported images.
# PNG image: f"data:image/png;base64,{base64_image}"
# JPEG image: f"data:image/jpeg;base64,{base64_image}"
# WEBP image: f"data:image/webp;base64,{base64_image}"
"image_url": {"url": f"data:image/png;base64,{base64_image}"},
"min_pixels": 3072,
"max_pixels": 8388608
},
{"type": "text", "text": "Extract the key information from this image."},
],
}
],
)
print(completion.choices[0].message.content)
import OpenAI from "openai";
import {
readFileSync
} from 'fs';
const client = new OpenAI({
apiKey: process.env.DASHSCOPE_API_KEY,
baseURL: "https://dashscope.aliyuncs.com/compatible-mode/v1"
});
// Read a local file and encode it in Base64 format.
const encodeImage = (imagePath) => {
const imageFile = readFileSync(imagePath);
return imageFile.toString('base64');
};
// Replace xxx/test.jpg with the absolute path of your local image.
const base64Image = encodeImage("xxx/test.jpg")
async function main() {
const completion = await client.chat.completions.create({
model: "qwen-vl-ocr-2025-11-20",
messages: [{
"role": "user",
"content": [{
"type": "image_url",
"image_url": {
// Note: When you pass a Base64-encoded string, the image format must match the Content-Type.
"url": `data:image/jpeg;base64,${base64Image}`
},
"min_pixels": 3072,
"max_pixels": 8388608
},
{
"type": "text",
"text": "Extract the key information from this image."
}
]
}]
});
console.log(completion.choices[0].message.content);
}
main();
# For information about how to convert a file to a Base64-encoded string, see the example code above.
# For demonstration purposes, the Base64-encoded string is truncated. In practice, you must pass the complete encoded string.
curl --location 'https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '{
"model": "qwen-vl-ocr-2025-11-20",
"messages": [
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAA..."}},
{"type": "text", "text": "Extract the key information from this image."}
]
}]
}'
import os
import base64
import dashscope
dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1'
# Base64 encoding format.
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
# Replace xxx/test.jpg with the absolute path of your local image.
base64_image = encode_image("xxx/test.jpg")
messages = [
{
"role": "user",
"content": [
{
# Note: When you pass a Base64-encoded string, the image format must match the Content-Type.
"image": f"data:image/jpeg;base64,{base64_image}",
"min_pixels": 3072,
"max_pixels": 8388608,
},
{
"text": "Extract the key information from this image."
},
],
}
]
response = dashscope.MultiModalConversation.call(
api_key=os.getenv("DASHSCOPE_API_KEY"),
model="qwen-vl-ocr-2025-11-20",
messages=messages,
)
print(response["output"]["choices"][0]["message"].content[0]["text"])
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.Arrays;
import java.util.Collections;
import java.util.Map;
import java.util.HashMap;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.common.MultiModalMessage;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import io.reactivex.Flowable;
import com.alibaba.dashscope.utils.Constants;
public class Main {
static {
Constants.baseHttpApiUrl="https://dashscope.aliyuncs.com/api/v1";
}
private static String encodeImageToBase64(String imagePath) throws IOException {
Path path = Paths.get(imagePath);
byte[] imageBytes = Files.readAllBytes(path);
return Base64.getEncoder().encodeToString(imageBytes);
}
public static void simpleMultiModalConversationCall(String localPath)
throws ApiException, NoApiKeyException, UploadFileException, IOException {
String base64Image = encodeImageToBase64(localPath);
MultiModalConversation conv = new MultiModalConversation();
Map<String, Object> map = new HashMap<>();
map.put("image", "data:image/jpeg;base64," + base64Image);
map.put("max_pixels", 8388608);
map.put("min_pixels", 3072);
MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue())
.content(Arrays.asList(
map,
Collections.singletonMap("text", "Extract the key information from this image."))).build();
MultiModalConversationParam param = MultiModalConversationParam.builder()
.apiKey(System.getenv("DASHSCOPE_API_KEY"))
.model("qwen-vl-ocr-2025-11-20")
.message(userMessage)
.build();
MultiModalConversationResult result = conv.call(param);
System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("text"));
}
public static void main(String[] args) {
try {
// Replace xxx/test.jpg with the absolute path of your local image.
simpleMultiModalConversationCall("xxx/test.jpg");
} catch (ApiException | NoApiKeyException | UploadFileException | IOException e) {
System.out.println(e.getMessage());
}
System.exit(0);
}
}
# For information about how to convert a file to a Base64-encoded string, see the example code above.
# For demonstration purposes, the Base64-encoded string is truncated. In practice, you must pass the complete encoded string.
curl -X POST https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H 'Content-Type: application/json' \
-d '{
"model": "qwen-vl-ocr-2025-11-20",
"input":{
"messages":[
{
"role": "user",
"content": [
{"image": "data:image/png;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAA..."},
{"text": "Extract the key information from this image."}
]
}
]
}
}'
(3840x2160) 的图片,支持以下格式:
| 图片格式 | 常见扩展名 | MIME 类型 |
|---|---|---|
| BMP | .bmp | image/bmp |
| JPEG | .jpe, .jpeg, .jpg | image/jpeg |
| PNG | .png | image/png |
| TIFF | .tif, .tiff | image/tiff |
| WEBP | .webp | image/webp |
| HEIC | .heic | image/heic |
4K(3840x2160) 至 8K(7680x4320) 之间的图片,仅支持 JPEG、JPG 和 PNG 格式。
10 MB。10 MB。System Message,使用固定的内置 System Message。所有指令须通过 User Message 传入。pdf2image)将 PDF 每页转换为高质量图片。enable_rotate: true 参数可显著提升识别效果。min_pixels 和 max_pixels 参数控制图片在处理前的缩放方式。
min_pixels:放大小图片以提升细节检测效果,保持默认值即可。max_pixels:防止超大图片消耗过多资源。大多数场景下默认值已足够;若小字体识别不清晰,可适当增大该值(注意会增加 Token 消耗)。如何选择文件上传方式?
| 类型 | 规格 | DashScope SDK(Python、Java) | OpenAI 兼容 / DashScope HTTP |
|---|---|---|---|
| 图片 | 7 MB 至 10 MB | 传入本地路径 | 仅支持公网 URL,建议使用对象存储服务。 |
| 小于 7 MB | 传入本地路径 | Base64 编码 |
模型输出文字定位结果后,如何在原图上绘制检测框?

