[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"doc-detail-31121":3,"doc-seo-31121":21},{"code":4,"msg":5,"data":6},0,"success",{"doc_id":7,"user_id":8,"nickname":9,"user_avatar":10,"doc_module":4,"category_id":11,"category_name":12,"doc_title":13,"doc_description":14,"file_id":15,"file_url":16,"file_type":17,"file_size":18,"view_count":4,"is_deleted":4,"is_public":19,"is_downloadable":19,"audit_status":19,"update_tm":20},31121,1374391975076,"Riley","https://ap-avatar.wpscdn.com/davatar_994ba38a5ba835b3df7d355c54d3ed8d",8,"Research & Report","Prompting Multimodal Vision-Language Models for Automated Student Engagement Prediction","Accurate student engagement prediction is critical for understanding learners and enabling responsive, personalized education. The work presents EngageCLIP, a vision-language model tailored for engagement assessment that leverages semantic information from label texts rather than relying solely on isolated visual features. EngageCLIP integrates video temporal modeling with cross-frame attention and dual-level video prompts (local frame prompts and global video prompts). A video-aware text prompt learning scheme dynamically generates textual representations aligned with visual content. Experiments on EngageNet and DAiSEE show state-of-the-art results, improving accuracy by 1.8% and 1.18% while balancing efficiency and performance, advancing vision-language contrastive learning for educational analysis.","cbCailFKWKObL9qr","https://ap.wps.com/l/cbCailFKWKObL9qr","pdf",856699,1,1778619772,{"code":4,"msg":22,"data":23},"ok",{"site_id":24,"language":25,"slug":26,"title":13,"keywords":27,"description":14,"schema_data":28,"social_meta":62,"head_meta":64,"extra_data":66,"updated_unix":20},105,"en","prompting-multimodal-vision-language-models-for-automated-student-engagement-prediction","",{"@graph":29,"@context":61},[30,47],{"@type":31,"itemListElement":32},"BreadcrumbList",[33,37,41,44],{"item":34,"name":35,"@type":36,"position":19},"https://docshare.wps.com","Home","ListItem",{"item":38,"name":39,"@type":36,"position":40},"https://docshare.wps.com/document/","Document",2,{"item":42,"name":12,"@type":36,"position":43},"https://docshare.wps.com/document/research-report/",3,{"item":45,"name":13,"@type":36,"position":46},"https://docshare.wps.com/document/prompting-multimodal-vision-language-models-for-automated-student-engagement-prediction/31121",4,{"url":45,"name":13,"@type":48,"author":49,"headline":13,"publisher":51,"fileFormat":54,"description":14,"dateModified":55,"datePublished":55,"encodingFormat":54,"isAccessibleForFree":56,"interactionStatistic":57},"DigitalDocument",{"name":9,"@type":50},"Person",{"url":34,"name":52,"@type":53},"DocShare","Organization","application/pdf","2026-05-12",true,{"@type":58,"interactionType":59,"userInteractionCount":4},"InteractionCounter",{"@type":60},"ViewAction","https://schema.org",{"og:url":45,"og:type":63,"og:title":13,"og:site_name":52,"og:description":14},"article",{"robots":65,"canonical":45},"index,follow",{"doc_id":7,"site_id":24}]