{"_id": {"$oid": "69d4eb12b877594bc5718d48"}, "test_id": "T0862", "benchmark": "company_lists", "date": "2026-03-25", "tags": {"document-type": ["book-page"], "writing": ["printed"], "century": [20], "language": ["en", "de"], "layout": ["list"], "entry-type": ["company"], "task": ["information-extraction"]}, "contributors": ["lea_kasper", "sorin_marti"], "hidden": false, "config": {"provider": "alibaba", "model": "qwen3.5-122b-a10b", "dataclass": "ListPage", "temperature": 0.5, "role_description": "You are a Historian", "prompt_file": "prompt_min.txt", "rules": "{\"base_url\": \"https://ws-wkjpbv6ov7sqlgqe.ap-southeast-1.maas.aliyuncs.com/compatible-mode/v1\"}", "legacy_test": false}, "prompt": "- Answer in valid JSON.\n- The page ID is given as {page_id}.\n", "results": {"text": "{\"page_id\":\"16\",\"entries\":[{\"entry_id\":\"1\",\"company_name\":\"F\u00e4rbereien Schetty, A.-G.\",\"location\":\"Basle\"},{\"entry_id\":\"2\",\"company_name\":\"G. Kappeler A.-G.\",\"location\":\"Zofingen\"},{\"entry_id\":\"3\",\"company_name\":\"J. A. Crabtree & Co., Ltd.\",\"location\":\"Walsall, England\"},{\"entry_id\":\"4\",\"company_name\":\"Brown Boveri & Cie., A.-G.\",\"location\":\"Baden (Switzerland)\"},{\"entry_id\":\"5\",\"company_name\":\"Magn\u00e9tos Lucifer S.A.\",\"location\":\"Carouge-Gen\u00e8ve\"}]}", "model": "qwen3.5-122b-a10b", "provider": "alibaba", "finish_reason": "stop", "usage": {"input_tokens": 2756, "output_tokens": 238, "total_tokens": 2994, "input_cost_usd": 0.0011024000000000001, "output_cost_usd": 0.0007616000000000001, "estimated_cost_usd": 0.0018640000000000002}, "duration": 9.213775157928467, "timestamp": "2026-03-25T10:22:46.924484", "parsed": {"page_id": "16", "entries": [{"entry_id": "1", "company_name": "F\u00e4rbereien Schetty, A.-G.", "location": "Basle"}, {"entry_id": "2", "company_name": "G. Kappeler A.-G.", "location": "Zofingen"}, {"entry_id": "3", "company_name": "J. A. Crabtree & Co., Ltd.", "location": "Walsall, England"}, {"entry_id": "4", "company_name": "Brown Boveri & Cie., A.-G.", "location": "Baden (Switzerland)"}, {"entry_id": "5", "company_name": "Magn\u00e9tos Lucifer S.A.", "location": "Carouge-Gen\u00e8ve"}]}, "conversation_id": "872236fa-9385-4b66-837a-0aaf633c67b8", "score": {"f1_score": 0.3, "precision": 0.5, "recall": 0.21621621621621623, "true_positives": 8, "false_positives": 8, "false_negatives": 29, "field_scores": {"entries[6].entry_id": {"response": "", "ground_truth": "156089_1321092_18-7", "score": 0.0}, "entries[6].location": {"response": "", "ground_truth": "Baden (Switzerland)", "score": 0.0}, "entries[11].location": {"response": "", "ground_truth": "Baden (Switzerland)", "score": 0.0}, "entries[3].location": {"response": "Baden (Switzerland)", "ground_truth": "Baden (Switzerland)", "score": 1.0}, "entries[2].company_name": {"response": "J. A. Crabtree & Co., Ltd.", "ground_truth": "J. A. Crabtree & Co., Ltd.", "score": 1.0}, "entries[3].company_name": {"response": "Brown Boveri & Cie., A.-G.", "ground_truth": "Brown Boveri & Cie., A.-G.", "score": 1.0}, "entries[5].company_name": {"response": "", "ground_truth": "Magn\u00e9tos Lucifer S. A.", "score": 0.0}, "entries[6].company_name": {"response": "", "ground_truth": "Brown Boveri & Cie., A.-G.", "score": 0.0}, "entries[4].entry_id": {"response": "5", "ground_truth": "156089_1321092_18-5", "score": 0.09999999999999998}, "entries[0].company_name": {"response": "F\u00e4rbereien Schetty, A.-G.", "ground_truth": "F\u00e4rbereien Schetty, A.-G.", "score": 1.0}, "entries[1].location": {"response": "Zofingen", "ground_truth": "Zofingen", "score": 1.0}, "entries[9].location": {"response": "", "ground_truth": "Baden (Switzerland)", "score": 0.0}, "entries[11].company_name": {"response": "", "ground_truth": "Brown Boveri & Cie., A.-G.", "score": 0.0}, "entries[7].location": {"response": "", "ground_truth": "Baden (Switzerland)", "score": 0.0}, "entries[8].company_name": {"response": "", "ground_truth": "Brown Boveri & Cie., A.-G.", "score": 0.0}, "entries[1].company_name": {"response": "G. Kappeler A.-G.", "ground_truth": "G. Kappeler A.-G.", "score": 1.0}, "entries[3].entry_id": {"response": "4", "ground_truth": "156089_1321092_18-4", "score": 0.09999999999999998}, "entries[9].entry_id": {"response": "", "ground_truth": "156089_1321092_18-10", "score": 0.0}, "entries[2].entry_id": {"response": "3", "ground_truth": "156089_1321092_18-3", "score": 0.09999999999999998}, "entries[9].company_name": {"response": "", "ground_truth": "Brown Boveri & Cie., A.-G.", "score": 0.0}, "entries[10].entry_id": {"response": "", "ground_truth": "156089_1321092_18-11", "score": 0.0}, "entries[10].location": {"response": "", "ground_truth": "Baden (Switzerland)", "score": 0.0}, "entries[7].entry_id": {"response": "", "ground_truth": "156089_1321092_18-8", "score": 0.0}, "entries[0].location": {"response": "Basle", "ground_truth": "Basle", "score": 1.0}, "entries[7].company_name": {"response": "", "ground_truth": "Brown Boveri & Cie., A.-G.", "score": 0.0}, "entries[5].location": {"response": "", "ground_truth": "Carouge-Gen\u00e8ve", "score": 0.0}, "entries[10].company_name": {"response": "", "ground_truth": "Brown Boveri & Cie., A.-G.", "score": 0.0}, "entries[4].location": {"response": "Carouge-Gen\u00e8ve", "ground_truth": "Baden (Switzerland)", "score": 0.24242424242424243}, "entries[5].entry_id": {"response": "", "ground_truth": "156089_1321092_18-6", "score": 0.0}, "entries[8].entry_id": {"response": "", "ground_truth": "156089_1321092_18-9", "score": 0.0}, "entries[8].location": {"response": "", "ground_truth": "Baden (Switzerland)", "score": 0.0}, "entries[11].entry_id": {"response": "", "ground_truth": "156089_1321092_18-12", "score": 0.0}, "entries[2].location": {"response": "Walsall, England", "ground_truth": "Walsall, England", "score": 1.0}, "entries[4].company_name": {"response": "Magn\u00e9tos Lucifer S.A.", "ground_truth": "Brown Boveri & Cie., A.-G.", "score": 0.34042553191489366}, "entries[1].entry_id": {"response": "2", "ground_truth": "156089_1321092_18-2", "score": 0.09999999999999998}, "page_id": {"response": "16", "ground_truth": "156089_1321092_18", "score": 0.21052631578947367}, "entries[0].entry_id": {"response": "1", "ground_truth": "156089_1321092_18-1", "score": 0.09999999999999998}}, "total_fields": 37}, "raw_response": "{\"id\":\"chatcmpl-42a967a5-89b7-96ba-a1fa-2ec2d4c53cb0\",\"choices\":[{\"finish_reason\":\"stop\",\"index\":0,\"logprobs\":null,\"message\":{\"content\":\"{\\n  \\\"page_id\\\": \\\"16\\\",\\n  \\\"entries\\\": [\\n    {\\n      \\\"entry_id\\\": \\\"1\\\",\\n      \\\"company_name\\\": \\\"F\u00e4rbereien Schetty, A.-G.\\\",\\n      \\\"location\\\": \\\"Basle\\\"\\n    },\\n    {\\n      \\\"entry_id\\\": \\\"2\\\",\\n      \\\"company_name\\\": \\\"G. Kappeler A.-G.\\\",\\n      \\\"location\\\": \\\"Zofingen\\\"\\n    },\\n    {\\n      \\\"entry_id\\\": \\\"3\\\",\\n      \\\"company_name\\\": \\\"J. A. Crabtree & Co., Ltd.\\\",\\n      \\\"location\\\": \\\"Walsall, England\\\"\\n    },\\n    {\\n      \\\"entry_id\\\": \\\"4\\\",\\n      \\\"company_name\\\": \\\"Brown Boveri & Cie., A.-G.\\\",\\n      \\\"location\\\": \\\"Baden (Switzerland)\\\"\\n    },\\n    {\\n      \\\"entry_id\\\": \\\"5\\\",\\n      \\\"company_name\\\": \\\"Magn\u00e9tos Lucifer S.A.\\\",\\n      \\\"location\\\": \\\"Carouge-Gen\u00e8ve\\\"\\n    }\\n  ]\\n}\",\"refusal\":null,\"role\":\"assistant\",\"annotations\":null,\"audio\":null,\"function_call\":null,\"tool_calls\":null,\"reasoning_content\":\"\"}}],\"created\":1774430567,\"model\":\"qwen3.5-122b-a10b\",\"object\":\"chat.completion\",\"service_tier\":null,\"system_fingerprint\":null,\"usage\":{\"completion_tokens\":238,\"prompt_tokens\":2756,\"total_tokens\":2994,\"completion_tokens_details\":{\"accepted_prediction_tokens\":null,\"audio_tokens\":null,\"reasoning_tokens\":null,\"rejected_prediction_tokens\":null,\"text_tokens\":238},\"prompt_tokens_details\":{\"audio_tokens\":null,\"cached_tokens\":null,\"image_tokens\":2544,\"text_tokens\":212}}}"}, "scoring": {"f1_micro": 0.4649257552483359, "f1_macro": 0.4686666666666667, "micro_precision": 0.4724245577523413, "micro_recall": 0.4576612903225806, "total_instances": 15, "total_tp": 454, "total_fp": 507, "total_fn": 538, "cost_summary": {"total_input_tokens": 40921, "total_output_tokens": 14857, "total_tokens": 55778, "input_cost_usd": 0.0163684, "output_cost_usd": 0.04754240000000001, "total_cost_usd": 0.06391080000000002}}, "normalized_score": 46.86666666666667}