[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"doc-detail-31749":3,"doc-seo-31749":27},{"code":4,"msg":5,"data":6},0,"success",{"doc_id":7,"user_id":8,"nickname":9,"user_avatar":10,"doc_module":4,"category_id":11,"category_name":12,"doc_title":13,"doc_description":14,"file_id":15,"file_url":16,"file_type":17,"file_size":18,"view_count":4,"is_deleted":4,"is_public":19,"is_downloadable":19,"audit_status":19,"page_count":20,"language":21,"language_code":22,"table_of_contents":23,"faqs":24,"seo_title":13,"seo_description":14,"update_tm":25,"read_time":26},31749,3848291630094,"Emma Wilson","https://eur-avatar.wpscdn.com/davatar_085a072bc5b1113ac321206ff7593b45",8,"Research & Report","AraSER-Fusion: Multimodal Arabic Speech Emotion Recognition with Audio–Text Joint Embeddings","Arabic speech emotion recognition remains underexplored because Arabic spans more than 400 million speakers and exhibits strong dialect diversity that complicates affect modeling. Existing approaches struggle with linguistic, phonetic, and cultural variability, limiting robustness and generalization. AraSER-Fusion addresses these issues through dialect-aware multimodal modeling that jointly encodes acoustic and textual signals. Transformer acoustic encoders, AraBERT-derived text embeddings, hierarchical attention fusion, contrastive cross-modal alignment, and adaptive gating improve complementary use of both modalities and deliver strong performance across dialects under noise.","cbCaipx5d2OPWJ6q","https://ap.wps.com/l/cbCaipx5d2OPWJ6q","pdf",3557975,1,21,"English","en","# Abstract\n# Keywords\n# Abbreviations\n# Introduction","[{\"question\":\"Why is Arabic speech emotion recognition challenging?\",\"answer\":\"Arabic SER is hindered by dialectal diversity, complex morphology, and rich prosodic patterns that differ across regions. These factors make affect modeling harder and reduce robustness.\"},{\"question\":\"What core idea does AraSER-Fusion use?\",\"answer\":\"AraSER-Fusion jointly models acoustic and textual signals using dialect-aware multimodal learning. It combines transformer-based acoustic encoders with AraBERT-derived text embeddings.\"},{\"question\":\"How does the framework fuse audio and text information?\",\"answer\":\"A hierarchical attention fusion module integrates both modalities, supported by contrastive cross-modal alignment and adaptive gating. This helps the model leverage complementary contributions effectively.\"}]",1780088473,53,{"code":4,"msg":28,"data":29},"ok",{"site_id":30,"language":22,"slug":31,"title":13,"keywords":32,"description":14,"schema_data":33,"social_meta":84,"head_meta":86,"extra_data":88,"updated_unix":25},105,"araser-fusion-multimodal-arabic-speech-emotion-recognition-with-audiotext-joint-embeddings","",{"@graph":34,"@context":83},[35,52,66],{"@type":36,"itemListElement":37},"BreadcrumbList",[38,42,46,49],{"item":39,"name":40,"@type":41,"position":19},"https://docshare.wps.com","Home","ListItem",{"item":43,"name":44,"@type":41,"position":45},"https://docshare.wps.com/document/","Document",2,{"item":47,"name":12,"@type":41,"position":48},"https://docshare.wps.com/document/research-report/",3,{"item":50,"name":13,"@type":41,"position":51},"https://docshare.wps.com/document/araser-fusion-multimodal-arabic-speech-emotion-recognition-with-audiotext-joint-embeddings/31749/",4,{"url":50,"name":13,"@type":53,"author":54,"headline":13,"publisher":56,"fileFormat":59,"description":14,"dateModified":60,"datePublished":60,"encodingFormat":59,"isAccessibleForFree":61,"interactionStatistic":62},"DigitalDocument",{"name":9,"@type":55},"Person",{"url":39,"name":57,"@type":58},"DocShare","Organization","application/pdf","2026-05-29",true,{"@type":63,"interactionType":64,"userInteractionCount":4},"InteractionCounter",{"@type":65},"ViewAction",{"@type":67,"mainEntity":68},"FAQPage",[69,75,79],{"name":70,"@type":71,"acceptedAnswer":72},"Why is Arabic speech emotion recognition challenging?","Question",{"text":73,"@type":74},"Arabic SER is hindered by dialectal diversity, complex morphology, and rich prosodic patterns that differ across regions. These factors make affect modeling harder and reduce robustness.","Answer",{"name":76,"@type":71,"acceptedAnswer":77},"What core idea does AraSER-Fusion use?",{"text":78,"@type":74},"AraSER-Fusion jointly models acoustic and textual signals using dialect-aware multimodal learning. It combines transformer-based acoustic encoders with AraBERT-derived text embeddings.",{"name":80,"@type":71,"acceptedAnswer":81},"How does the framework fuse audio and text information?",{"text":82,"@type":74},"A hierarchical attention fusion module integrates both modalities, supported by contrastive cross-modal alignment and adaptive gating. This helps the model leverage complementary contributions effectively.","https://schema.org",{"og:url":50,"og:type":85,"og:title":13,"og:site_name":57,"og:description":14},"article",{"robots":87,"canonical":50},"index,follow",{"doc_id":7,"site_id":30}]