@misc{wang2025metokmultistageeventbasedtoken,
  title         = {METok: Multi-Stage Event-based Token Compression for Efficient Long Video Understanding},
  author        = {Wang, Mengyue and Chen, Shuo and Kersting, Kristian and Tresp, Volker and Ma, Yunpu},
  year          = {2025},
  eprint        = {2506.02850},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2506.02850},
}
Language Mixing in Reasoning Language Models: Patterns, Impact, and Internal Causes
@misc{wang2025languagemixingreasoninglanguage,
  title         = {Language Mixing in Reasoning Language Models: Patterns, Impact, and Internal Causes},
  author        = {Wang, Mingyang and Lange, Lukas and Adel, Heike and Ma, Yunpu and Strötgen, Jannik and Schütze, Hinrich},
  year          = {2025},
  eprint        = {2505.14815},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2505.14815},
}
CoT-Kinetics: A Theoretical Modeling Assessing LRM Reasoning Process
@misc{bi2025cotkineticstheoreticalmodelingassessing,
  title         = {CoT-Kinetics: A Theoretical Modeling Assessing LRM Reasoning Process},
  author        = {Bi, Jinhe and Yan, Danqi and Wang, Yifan and Huang, Wenke and Chen, Haokun and Wan, Guancheng and Ye, Mang and Xiao, Xun and Schuetze, Hinrich and Tresp, Volker and Ma, Yunpu},
  year          = {2025},
  eprint        = {2505.13408},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2505.13408},
}
WebPilot: A Versatile and Autonomous Multi-Agent System for Web Task Execution with Strategic Exploration
Yao Zhang, Zijian Ma, Yunpu Ma†, Zhen Han, Yu Wu, and Volker Tresp
In Proceedings of the AAAI Conference on Artificial Intelligence, 2025
@inproceedings{zhang2025webpilot,
  title     = {WebPilot: A Versatile and Autonomous Multi-Agent System for Web Task Execution with Strategic Exploration},
  author    = {Zhang, Yao and Ma, Zijian and Ma, Yunpu and Han, Zhen and Wu, Yu and Tresp, Volker},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {39},
  number    = {22},
  pages     = {23378--23386},
  year      = {2025},
}
OpenDriveVLA: Towards End-to-end Autonomous Driving with Large Vision Language Action Model
Xingcheng Zhou, Xuyuan Han, Feng Yang, Yunpu Ma, and Alois C. Knoll
@misc{zhou2025opendrivevlaendtoendautonomousdriving,
  title         = {OpenDriveVLA: Towards End-to-end Autonomous Driving with Large Vision Language Action Model},
  author        = {Zhou, Xingcheng and Han, Xuyuan and Yang, Feng and Ma, Yunpu and Knoll, Alois C.},
  year          = {2025},
  eprint        = {2503.23463},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2503.23463},
}
Supposedly Equivalent Facts That Aren’t? Entity Frequency in Pre-training Induces Asymmetry in LLMs
@misc{he2025supposedlyequivalentfactsarent,
  title         = {Supposedly Equivalent Facts That Aren't? Entity Frequency in Pre-training Induces Asymmetry in LLMs},
  author        = {He, Yuan and He, Bailan and Ding, Zifeng and Lupidi, Alisia and Zhu, Yuqicheng and Chen, Shuo and Zhang, Caiqi and Chen, Jiaoyan and Ma, Yunpu and Tresp, Volker and Horrocks, Ian},
  year          = {2025},
  eprint        = {2503.22362},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2503.22362},
}
PRISM: Self-Pruning Intrinsic Selection Method for Training-Free Multimodal Data Selection
Jinhe Bi, Yifan Wang, Danqi Yan, Xun Xiao, Artur Hecker, Volker Tresp, and Yunpu Ma†
@misc{bi2025prismselfpruningintrinsicselection,
  title         = {PRISM: Self-Pruning Intrinsic Selection Method for Training-Free Multimodal Data Selection},
  author        = {Bi, Jinhe and Wang, Yifan and Yan, Danqi and Xiao, Xun and Hecker, Artur and Tresp, Volker and Ma, Yunpu},
  year          = {2025},
  eprint        = {2502.12119},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2502.12119},
}
LLaVA Steering: Visual Instruction Tuning with 500x Fewer Parameters through Modality Linear Representation-Steering
Jinhe Bi, Yujun Wang, Haokun Chen, Xun Xiao, Artur Hecker, Volker Tresp, and Yunpu Ma†
@inproceedings{bi2025llavasteeringvisualinstruction,
  title     = {LLaVA Steering: Visual Instruction Tuning with 500x Fewer Parameters through Modality Linear Representation-Steering},
  author    = {Bi, Jinhe and Wang, Yujun and Chen, Haokun and Xiao, Xun and Hecker, Artur and Tresp, Volker and Ma, Yunpu},
  booktitle = {Annual Meeting of the Association for Computational Linguistics},
  year      = {2025},
  url       = {https://arxiv.org/abs/2412.12359},
}
DyGMamba: Efficiently Modeling Long-Term Temporal Dependency on Continuous-Time Dynamic Graphs with State Space Models
Zifeng Ding, Yifeng Li, Yuan He, Antonio Norelli, Jingcheng Wu, Volker Tresp, Michael Bronstein, and Yunpu Ma†
@article{ding2025dygmambaefficientlymodelinglongterm,
  title   = {DyGMamba: Efficiently Modeling Long-Term Temporal Dependency on Continuous-Time Dynamic Graphs with State Space Models},
  author  = {Ding, Zifeng and Li, Yifeng and He, Yuan and Norelli, Antonio and Wu, Jingcheng and Tresp, Volker and Bronstein, Michael and Ma, Yunpu},
  journal = {Transactions on Machine Learning Research},
  year    = {2025},
  url     = {https://arxiv.org/abs/2408.04713},
}
2024
PERFT: Parameter-Efficient Routed Fine-Tuning for Mixture-of-Expert Model
@misc{liu2024perftparameterefficientroutedfinetuning,
  title         = {PERFT: Parameter-Efficient Routed Fine-Tuning for Mixture-of-Expert Model},
  author        = {Liu, Yilun and Ma, Yunpu and Chen, Shuo and Ding, Zifeng and He, Bailan and Han, Zhen and Tresp, Volker},
  year          = {2024},
  eprint        = {2411.08212},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2411.08212},
}
VideoINSTA: Zero-shot Long Video Understanding via Informative Spatial-Temporal Reasoning with LLMs
Ruotong Liao, Max Erler, Huiyu Wang, Guangyao Zhai, Gengyuan Zhang, Yunpu Ma†, and Volker Tresp
In Findings of the Association for Computational Linguistics: EMNLP 2024, Nov 2024
In the video-language domain, recent works leveraging zero-shot Large Language Model-based reasoning for video understanding have become competitive challengers to previous end-to-end models. However, long video understanding presents unique challenges due to the complexity of reasoning over extended timespans, even for zero-shot LLM-based approaches. The challenge of information redundancy in long videos prompts the question of what specific information is essential for large language models (LLMs) and how to leverage them for complex spatial-temporal reasoning in long-form video analysis. We propose VideoINSTA, a framework for INformative Spatial-TemporAl Reasoning for zero-shot long-form video understanding. VideoINSTA contributes (1) a zero-shot framework for long video understanding using LLMs; (2) an event-based temporal reasoning and content-based spatial reasoning approach for LLMs to reason over spatial-temporal information in videos; (3) a self-reflective information reasoning scheme based on information sufficiency and prediction confidence while balancing temporal factors. Our model significantly improves the state-of-the-art on three long video question-answering benchmarks: EgoSchema, NextQA, and IntentQA, and the open question answering dataset ActivityNetQA. Code is released: https://github.com/mayhugotong/VideoINSTA.
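As an illustration of the self-reflective scheme in (3), the sketch below shows one way such a loop could look: the temporal context is widened round by round until the model reports that the gathered information is sufficient and its prediction confidence is high. All names here (the Event structure, the llm callable and its assumed return format, the 0.8 threshold) are hypothetical placeholders based only on the abstract, not the released implementation; see the linked repository for the actual code.

from dataclasses import dataclass
from typing import Callable, Dict, List


@dataclass
class Event:
    """One temporal segment of the video with a content-based spatial caption."""
    start: float   # segment start time in seconds
    end: float     # segment end time in seconds
    caption: str   # spatial description of the segment content


def answer_with_self_reflection(
    events: List[Event],
    question: str,
    llm: Callable[[str], Dict],  # assumed to return {"answer": str, "confidence": float, "sufficient": bool}
    conf_threshold: float = 0.8,
    max_rounds: int = 3,
) -> str:
    """Widen the temporal context round by round until the LLM reports that the
    gathered information is sufficient and its prediction confidence is high."""
    step = max(1, len(events) // max_rounds)
    result = {"answer": "", "confidence": 0.0, "sufficient": False}
    for round_idx in range(1, max_rounds + 1):
        # Take a growing prefix of temporally ordered, captioned events as context.
        context = "\n".join(
            f"[{e.start:.1f}-{e.end:.1f}s] {e.caption}"
            for e in events[: step * round_idx]
        )
        prompt = (
            "Temporally ordered video events:\n"
            f"{context}\n\n"
            f"Question: {question}\n"
            "Answer, state whether the information is sufficient, and give a confidence in [0, 1]."
        )
        result = llm(prompt)
        if result["sufficient"] and result["confidence"] >= conf_threshold:
            return result["answer"]
    # Fall back to the final round's answer if the stopping criterion was never met.
    return result["answer"]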
@inproceedings{liao-etal-2024-videoinsta,
  title     = {{V}ideo{INSTA}: Zero-shot Long Video Understanding via Informative Spatial-Temporal Reasoning with {LLM}s},
  author    = {Liao, Ruotong and Erler, Max and Wang, Huiyu and Zhai, Guangyao and Zhang, Gengyuan and Ma, Yunpu and Tresp, Volker},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.384/},
  doi       = {10.18653/v1/2024.findings-emnlp.384},
  pages     = {6577--6602},
}