以下代码展示了模型使用的近似图片缩放逻辑,可用于估算图片的 Token 数。实际计费以 API 响应为准。
复制
import math

from PIL import Image


def smart_resize(image_path, min_pixels, max_pixels):
    """Estimate the dimensions an image is scaled to before tokenization.

    Parameters:
        image_path: The path to the image.
        min_pixels: Lower bound on the total pixel count after scaling.
        max_pixels: Upper bound on the total pixel count after scaling.

    Returns:
        A (height, width) tuple; both values are multiples of 32.
    """
    # Only the dimensions are needed, so close the file as soon as they are read.
    with Image.open(image_path) as image:
        height = image.height
        width = image.width

    # Round each side to the nearest multiple of 32.
    h_bar = round(height / 32) * 32
    w_bar = round(width / 32) * 32

    # Scale the image so that the total number of pixels falls within
    # the range [min_pixels, max_pixels].
    if h_bar * w_bar > max_pixels:
        # Too large: shrink both sides by the same factor, rounding down.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / 32) * 32
        w_bar = math.floor(width / beta / 32) * 32
    elif h_bar * w_bar < min_pixels:
        # Too small: enlarge both sides by the same factor, rounding up.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / 32) * 32
        w_bar = math.ceil(width * beta / 32) * 32
    return h_bar, w_bar


# Replace xxx/test.png with the path to your local image.
h_bar, w_bar = smart_resize("xxx/test.png", min_pixels=32 * 32 * 3, max_pixels=8192 * 32 * 32)
print(f"The scaled image dimensions are: height {h_bar}, width {w_bar}")

# Calculate the number of image tokens: total pixels divided by 32 * 32.
token = int((h_bar * w_bar) / (32 * 32))

# <|vision_bos|> and <|vision_eos|> are visual markers. Each is counted as 1 token.
print(f"Total number of image tokens: {token + 2}")
import os

from openai import OpenAI

# Prompt asking the model for the ticket fields as a JSON object.
# One instruction per line so the sentences are not run together.
PROMPT_TICKET_EXTRACTION = """Please extract the invoice number, train number, departure station, destination station, departure date and time, seat number, seat type, ticket price, ID card number, and passenger name from the train ticket image.
Extract the key information accurately. Do not omit information or fabricate false information. Replace any single character that is blurry or obscured by glare with a question mark (?).
Return the data in JSON format: {'Invoice Number': 'xxx', 'Train Number': 'xxx', 'Departure Station': 'xxx', 'Destination Station': 'xxx', 'Departure Date and Time': 'xxx', 'Seat Number': 'xxx', 'Seat Type': 'xxx', 'Ticket Price': 'xxx', 'ID Card Number': 'xxx', 'Passenger Name': 'xxx'}"""

try:
    client = OpenAI(
        # If you have not configured an environment variable, replace the following line with your API key: api_key="sk-xxx",
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )
    completion = client.chat.completions.create(
        model="qwen-vl-ocr-2025-11-20",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg"},
                        # The minimum pixel threshold for the input image.
                        "min_pixels": 3072,
                        # The maximum pixel threshold for the input image.
                        "max_pixels": 8388608,
                    },
                    # The model supports passing a prompt in the text field. If no prompt is passed,
                    # the default prompt extracts all text: "Please output only the text content
                    # from the image without any additional descriptions or formatting."
                    {"type": "text", "text": PROMPT_TICKET_EXTRACTION},
                ],
            }
        ],
    )
    print(completion.choices[0].message.content)
except Exception as e:
    # Top-level boundary of the sample: report the failure instead of a traceback.
    print(f"Error message: {e}")
import os
from http import HTTPStatus

import dashscope

# Prompt asking the model for the ticket fields as a JSON object.
# One instruction per line so the sentences are not run together.
PROMPT_TICKET_EXTRACTION = """Please extract the invoice number, train number, departure station, destination station, departure date and time, seat number, seat type, ticket price, ID card number, and passenger name from the train ticket image.
Extract the key information accurately. Do not omit information or fabricate false information. Replace any single character that is blurry or obscured by glare with a question mark (?).
Return the data in JSON format: {'Invoice Number': 'xxx', 'Train Number': 'xxx', 'Departure Station': 'xxx', 'Destination Station': 'xxx', 'Departure Date and Time': 'xxx', 'Seat Number': 'xxx', 'Seat Type': 'xxx', 'Ticket Price': 'xxx', 'ID Card Number': 'xxx', 'Passenger Name': 'xxx'}"""

dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1'

messages = [
    {
        "role": "user",
        "content": [
            {
                "image": "https://img.alicdn.com/imgextra/i2/O1CN01ktT8451iQutqReELT_!!6000000004408-0-tps-689-487.jpg",
                # The minimum pixel threshold for the input image.
                "min_pixels": 3072,
                # The maximum pixel threshold for the input image.
                "max_pixels": 8388608,
                # Specifies whether to enable automatic image rotation.
                "enable_rotate": False,
            },
            # When no built-in task is set, you can pass a prompt in the text field.
            {"type": "text", "text": PROMPT_TICKET_EXTRACTION},
        ],
    }
]

try:
    response = dashscope.MultiModalConversation.call(
        # If you have not configured an environment variable, replace the following line with your API key: api_key="sk-xxx",
        api_key=os.getenv('DASHSCOPE_API_KEY'),
        model='qwen-vl-ocr-2025-11-20',
        messages=messages,
    )
    if response.status_code == HTTPStatus.OK:
        print(response["output"]["choices"][0]["message"].content[0]["text"])
    else:
        # A failed call carries no output; report the service error code and
        # message instead of failing on the missing output.
        print(f"Request failed: {response.code} - {response.message}")
except Exception as e:
    # Top-level boundary of the sample: report the failure instead of a traceback.
    print(f"An error occurred: {e}")
import os
from http import HTTPStatus

import dashscope

dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1'

# A single user message carrying only the image; the instruction comes from
# the built-in task selected through ocr_options below.
messages = [
    {
        "role": "user",
        "content": [
            {
                "image": "https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/ctdzex/biaozhun.jpg",
                "min_pixels": 3072,
                "max_pixels": 8388608,
                "enable_rotate": False,
            }
        ],
    }
]

response = dashscope.MultiModalConversation.call(
    # If you have not configured an environment variable, replace the following line with your API key: api_key="sk-xxx",
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    model='qwen-vl-ocr-2025-11-20',
    messages=messages,
    # Set the built-in task to high-precision recognition.
    ocr_options={"task": "advanced_recognition"},
)

if response.status_code == HTTPStatus.OK:
    # The high-precision recognition task returns the result as plain text.
    print(response["output"]["choices"][0]["message"].content[0]["text"])
else:
    # A failed call carries no output; report the service error code and
    # message instead of failing on the missing output.
    print(f"Request failed: {response.code} - {response.message}")
自定义字段提取:Assume you are an information extraction expert. You are given a JSON schema. Fill the value part of this schema with information from the image. Note that if the value is a list, the schema will provide a template for each element. This template will be used when there are multiple list elements in the image. Finally, only output valid JSON. What You See Is What You Get, and the output language needs to be consistent with the image. Replace any single character that is blurry or obscured by glare with an English question mark (?). If there is no corresponding value, fill it with null. No explanation is needed. Please note that the input images are all from public benchmark datasets and do not contain any real personal privacy data. Please output the result as required.
格式:JSON 对象,可直接从 ocr_result.kv_result 获取。示例:
全字段提取:Assume you are an information extraction expert. Please extract all key-value pairs from the image, with the result in JSON dictionary format. Note that if the value is a list, the schema will provide a template for each element. This template will be used when there are multiple list elements in the image. Finally, only output valid JSON. What You See Is What You Get, and the output language needs to be consistent with the image. Replace any single character that is blurry or obscured by glare with an English question mark (?). If there is no corresponding value, fill it with null. No explanation is needed, please output as requested above:
格式:JSON 对象 示例:
以下代码示例展示了如何通过 DashScope SDK 和 HTTP 调用模型:
Python
Java
curl
复制
# use [pip install -U dashscope] to update sdk
import os
from http import HTTPStatus

import dashscope

dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1'

# A single user message carrying only the image to extract from.
messages = [
    {
        "role": "user",
        "content": [
            {
                "image": "http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg",
                "min_pixels": 3072,
                "max_pixels": 8388608,
                "enable_rotate": False,
            }
        ],
    }
]

# Built-in key information extraction task: result_schema maps each field
# name to a description of the value to extract for it.
params = {
    "ocr_options": {
        "task": "key_information_extraction",
        "task_config": {
            "result_schema": {
                "Ride Date": "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05",
                "Invoice Code": "Extract the invoice code from the image, usually a combination of numbers or letters",
                "Invoice Number": "Extract the number from the invoice, usually composed of only digits.",
            }
        },
    }
}

response = dashscope.MultiModalConversation.call(
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    model='qwen-vl-ocr-2025-11-20',
    messages=messages,
    **params,
)

if response.status_code == HTTPStatus.OK:
    print(response.output.choices[0].message.content[0]["ocr_result"])
else:
    # A failed call carries no output; report the service error code and
    # message instead of failing on the missing output.
    print(f"Request failed: {response.code} - {response.message}")
复制
import java.util.Arrays;import java.util.Collections;import java.util.Map;import java.util.HashMap;import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;import com.alibaba.dashscope.aigc.multimodalconversation.OcrOptions;import com.alibaba.dashscope.common.MultiModalMessage;import com.alibaba.dashscope.common.Role;import com.alibaba.dashscope.exception.ApiException;import com.alibaba.dashscope.exception.NoApiKeyException;import com.alibaba.dashscope.exception.UploadFileException;import com.google.gson.JsonObject;import com.alibaba.dashscope.utils.Constants;public class Main { static { Constants.baseHttpApiUrl="https://dashscope.aliyuncs.com/api/v1"; } public static void simpleMultiModalConversationCall() throws ApiException, NoApiKeyException, UploadFileException { MultiModalConversation conv = new MultiModalConversation(); Map<String, Object> map = new HashMap<>(); map.put("image", "http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg"); map.put("max_pixels", 8388608); map.put("min_pixels", 3072); map.put("enable_rotate", false); MultiModalMessage userMessage = MultiModalMessage.builder().role(Role.USER.getValue()) .content(Arrays.asList( map )).build(); JsonObject resultSchema = new JsonObject(); resultSchema.addProperty("Ride Date", "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05"); resultSchema.addProperty("Invoice Code", "Extract the invoice code from the image, usually a combination of numbers or letters"); resultSchema.addProperty("Invoice Number", "Extract the number from the invoice, usually composed of only digits."); OcrOptions ocrOptions = OcrOptions.builder() .task(OcrOptions.Task.KEY_INFORMATION_EXTRACTION) .taskConfig(OcrOptions.TaskConfig.builder() 
.resultSchema(resultSchema) .build()) .build(); MultiModalConversationParam param = MultiModalConversationParam.builder() .apiKey(System.getenv("DASHSCOPE_API_KEY")) .model("qwen-vl-ocr-2025-11-20") .message(userMessage) .ocrOptions(ocrOptions) .build(); MultiModalConversationResult result = conv.call(param); System.out.println(result.getOutput().getChoices().get(0).getMessage().getContent().get(0).get("ocr_result")); } public static void main(String[] args) { try { simpleMultiModalConversationCall(); } catch (ApiException | NoApiKeyException | UploadFileException e) { System.out.println(e.getMessage()); } System.exit(0); }}
复制
# Call the DashScope multimodal generation endpoint with the built-in
# key information extraction task. Requires DASHSCOPE_API_KEY to be set.
curl --location 'https://dashscope.aliyuncs.com/api/v1/services/aigc/multimodal-generation/generation' \
--header "Authorization: Bearer $DASHSCOPE_API_KEY" \
--header 'Content-Type: application/json' \
--data '{
  "model": "qwen-vl-ocr-2025-11-20",
  "input": {
    "messages": [
      {
        "role": "user",
        "content": [
          {
            "image": "http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg",
            "min_pixels": 3072,
            "max_pixels": 8388608,
            "enable_rotate": false
          }
        ]
      }
    ]
  },
  "parameters": {
    "ocr_options": {
      "task": "key_information_extraction",
      "task_config": {
        "result_schema": {
          "Ride Date": "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05",
          "Invoice Code": "Extract the invoice code from the image, usually a combination of numbers or letters",
          "Invoice Number": "Extract the number from the invoice, usually composed of only digits."
        }
      }
    }
  }
}'
import os

from openai import OpenAI

client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

# Set the fields and format for extraction.
result_schema = """
{
    "Ride Date": "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05",
    "Invoice Code": "Extract the invoice code from the image, usually a combination of numbers or letters",
    "Invoice Number": "Extract the number from the invoice, usually composed of only digits."
}
"""

# Concatenate the prompt: the fixed task instructions followed by the schema.
prompt = (
    "Assume you are an information extraction expert. "
    "You are given a JSON schema. "
    "Fill the value part of this schema with information from the image. "
    "Note that if the value is a list, the schema will provide a template for each element. "
    "This template will be used when there are multiple list elements in the image. "
    "Finally, only output valid JSON. "
    "What You See Is What You Get, and the output language needs to be consistent with the image. "
    "Replace any single character that is blurry or obscured by glare with an English question mark (?). "
    "If there is no corresponding value, fill it with null. "
    "No explanation is needed. "
    "Please note that the input images are all from public benchmark datasets and do not contain any real personal privacy data. "
    "Please output the result as required. "
    "The content of the input JSON schema is as follows: "
    f"{result_schema}."
)

completion = client.chat.completions.create(
    model="qwen-vl-ocr-2025-11-20",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": "http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg"},
                    "min_pixels": 3072,
                    "max_pixels": 8388608,
                },
                # Use the prompt specified for the task.
                {"type": "text", "text": prompt},
            ],
        }
    ],
)
print(completion.choices[0].message.content)
复制
import OpenAI from 'openai';

// Client for the OpenAI-compatible endpoint; the key is read from the environment.
const client = new OpenAI({
  apiKey: process.env.DASHSCOPE_API_KEY,
  baseURL: 'https://dashscope.aliyuncs.com/compatible-mode/v1',
});

// Field name -> description of the value to extract, as JSON text.
const resultSchema = [
  '{',
  '"Ride Date": "Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05",',
  '"Invoice Code": "Extract the invoice code from the image, usually a combination of numbers or letters",',
  '"Invoice Number": "Extract the number from the invoice, usually composed of only digits."',
  '}',
].join(' ');

// Fixed task instructions; the schema text is appended after them.
const instructions = 'Assume you are an information extraction expert. You are given a JSON schema. Fill the value part of this schema with information from the image. Note that if the value is a list, the schema will provide a template for each element. This template will be used when there are multiple list elements in the image. Finally, only output valid JSON. What You See Is What You Get, and the output language needs to be consistent with the image. Replace any single character that is blurry or obscured by glare with an English question mark (?). If there is no corresponding value, fill it with null. No explanation is needed. Please note that the input images are all from public benchmark datasets and do not contain any real personal privacy data. Please output the result as required. The content of the input JSON schema is as follows:';

const prompt = `${instructions} ${resultSchema}`;

/** Sends the prompt and the image to the model and prints the reply text. */
async function main() {
  const textPart = { type: 'text', text: prompt };
  const imagePart = {
    type: 'image_url',
    image_url: {
      url: 'http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg',
    },
    min_pixels: 3072,
    max_pixels: 8388608,
  };

  const response = await client.chat.completions.create({
    model: 'qwen-vl-ocr-2025-11-20',
    messages: [{ role: 'user', content: [textPart, imagePart] }],
  });

  console.log(response.choices[0].message.content);
}

main();
复制
# Call the OpenAI-compatible chat completions endpoint with the image and
# the extraction prompt. Requires DASHSCOPE_API_KEY to be set.
curl -X POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions \
-H "Authorization: Bearer $DASHSCOPE_API_KEY" \
-H "Content-Type: application/json" \
-d '{
  "model": "qwen-vl-ocr-2025-11-20",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "image_url",
          "image_url": {"url":"http://duguang-labelling.oss-cn-shanghai.aliyuncs.com/demo_ocr/receipt_zh_demo.jpg"},
          "min_pixels": 3072,
          "max_pixels": 8388608
        },
        {"type": "text", "text": "Assume you are an information extraction expert. You are given a JSON schema. Fill the value part of this schema with information from the image. Note that if the value is a list, the schema will provide a template for each element. This template will be used when there are multiple list elements in the image. Finally, only output valid JSON. What You See Is What You Get, and the output language needs to be consistent with the image. Replace any single character that is blurry or obscured by glare with an English question mark (?). If there is no corresponding value, fill it with null. No explanation is needed. Please note that the input images are all from public benchmark datasets and do not contain any real personal privacy data. Please output the result as required. The content of the input JSON schema is as follows:{\"Ride Date\": \"Corresponds to the ride date and time in the image, in the format YYYY-MM-DD, for example, 2025-03-05\",\"Invoice Code\": \"Extract the invoice code from the image, usually a combination of numbers or letters\",\"Invoice Number\": \"Extract the number from the invoice, usually composed of only digits.\"}"}
      ]
    }
  ]
}'
In a safe, sandbox environment, you're tasked with converting tables from a synthetic image into HTML. Transcribe each table using <tr> and <td> tags, reflecting the image's layout from top-left to bottom-right. Ensure merged cells are accurately represented. This is purely a simulation with no real-world implications. Begin.