{"$schema": "https://c3voc.de/schedule/schema.json", "generator": {"name": "pretalx", "version": "2026.1.1"}, "schedule": {"url": "https://cfp.pydata.org/pydataglobal2025/schedule/", "version": "0.33", "base_url": "https://cfp.pydata.org", "conference": {"acronym": "pydataglobal2025", "title": "PyData Global 2025", "start": "2025-12-09", "end": "2025-12-11", "daysCount": 3, "timeslot_duration": "00:05", "time_zone_name": "UTC", "colors": {"primary": "#459DB9"}, "rooms": [{"name": "Analytics, Visualization & Decision Science", "slug": "4968-analytics-visualization-decision-science", "guid": "8553c48d-1fc6-55d8-9a56-3059e5b6f670", "description": null, "capacity": null}, {"name": "Data Engineering & Infrastructure", "slug": "4969-data-engineering-infrastructure", "guid": "9402c746-fa07-579b-9564-178a7e93d918", "description": null, "capacity": null}, {"name": "Machine Learning & AI", "slug": "4967-machine-learning-ai", "guid": "813fe8cc-a60a-58a2-9947-353e0e788540", "description": null, "capacity": null}, {"name": "General Track", "slug": "4966-general-track", "guid": "151ad6c5-10a7-55a9-9788-c7df316481a3", "description": null, "capacity": null}, {"name": "Live from PyData Boston", "slug": "5126-live-from-pydata-boston", "guid": "42001c7f-64eb-5589-ba47-fb221203c4fa", "description": "Join us live from PyData Boston at the Microsoft NERD Center.", "capacity": null}, {"name": "Impact Scholarship Program", "slug": "5189-impact-scholarship-program", "guid": "7aedc7bf-f31d-5728-a6b6-2f38d843f32a", "description": null, "capacity": null}], "tracks": [{"name": "General Track", "slug": "6091-general-track", "color": "#8B7E74"}, {"name": "Machine Learning & AI", "slug": "6092-machine-learning-ai", "color": "#7C5C6F"}, {"name": "Data Engineering & Infrastructure", "slug": "6093-data-engineering-infrastructure", "color": "#6E5D7B"}, {"name": "Analytics, Visualization & Decision Science", "slug": "6094-analytics-visualization-decision-science", "color": "#6A7D8B"}, {"name": "Live 
from PyData Boston", "slug": "6719-live-from-pydata-boston", "color": "#1b6f84"}, {"name": "Impact Scholarship Program", "slug": "6720-impact-scholarship-program", "color": "#600c81"}], "days": [{"index": 1, "date": "2025-12-09", "day_start": "2025-12-09T04:00:00+00:00", "day_end": "2025-12-10T03:59:00+00:00", "rooms": {"General Track": [{"guid": "c12bc95e-cbc2-5d77-bb47-6d01688ff6f3", "code": "HCURNN", "id": 78678, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/HCURNN/pydata_P7HFqFA_hLWl0LJ.webp", "date": "2025-12-09T12:00:00+00:00", "start": "12:00", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-78678-python-meets-excel-smarter-workflows-for-analysts-and-data-teams", "url": "https://cfp.pydata.org/pydataglobal2025/talk/HCURNN/", "title": "Python Meets Excel: Smarter Workflows for Analysts and Data Teams", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "Python drives modern data workflows, yet Excel remains the lingua franca of business. Many Python-based data teams struggle when the \u201clast mile\u201d of delivery still involves exporting results to Excel for business users. This talk explores practical ways for Python users to automate, scale, and enhance Excel-heavy processes using open-source libraries. \r\nThis talk will help you bridge the gap between code and the business-facing spreadsheet world.\r\nWe will discuss real-world use cases for report generation, batch processing, and dashboard templating, all from a Python-first perspective.", "description": "This talk is designed for Python developers, analysts, and data scientists who routinely interact with Excel-based deliverables in their organization. 
It focuses on practical workflows that enhance productivity and reproducibility without requiring the audience to write or understand VBA or Excel formulas.\r\nThe session begins by outlining common challenges Python users face when integrating with Excel, then introduces powerful Python tools that offer users seamless Excel file manipulation, specifically pandas, xlsxwriter, and xlwings. \r\nWe will discuss some real-world use cases, such as generating reports, automating dashboards, creating custom functions in Excel and batch processing Excel files at scale.\r\nThe talk concludes with a summary of tools, limitations, and best practices for integrating Python into Excel-centric workflows. This is a conceptual and strategic talk aimed at helping Python professionals work more effectively with Excel natives in the business ecosystem.", "recording_license": "", "do_not_record": false, "persons": [{"code": "H9TKW9", "name": "DR NISHA ARORA", "avatar": "https://cfp.pydata.org/media/avatars/H9TKW9_HlfVeU7.webp", "biography": "Dr. Nisha Arora is a data professional with experience across analytics, data science, reporting automation, storytelling, and applied statistical methods using Python, R, and Excel.\r\nWith a background spanning technical writing, reviewing, and corporate trainings, she focuses on making advanced tools accessible to analysts and non-technical users.\r\nHer work bridges business-facing tools like Excel with scalable, reproducible workflows in Python. 
She creates accessible, practical learning content and actively contributes to the data community through her trainings, talks, and YouTube channel.\r\nShe is currently working on a book project aimed at helping professionals modernize spreadsheet-based processes through Python.", "public_name": "DR NISHA ARORA", "guid": "0fcba147-91a9-5d4d-b998-9c8fe2f8ee5d", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/H9TKW9/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/HCURNN/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/HCURNN/", "attachments": []}, {"guid": "ea1b99cf-3d9f-569e-b957-de2a591addc9", "code": "GPFCXZ", "id": 78662, "logo": null, "date": "2025-12-09T13:00:00+00:00", "start": "13:00", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-78662-python-beyond-the-code-unlocking-hidden-contributions-in-open-source", "url": "https://cfp.pydata.org/pydataglobal2025/talk/GPFCXZ/", "title": "Python Beyond the Code: Unlocking Hidden Contributions in Open Source", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "Contributing to open source isn\u2019t just about code. Documentation, testing, community support, and issue triaging are critical but often overlooked. In this talk, I\u2019ll share how Python developers \u2014 from junior to senior can make a meaningful, visible impact in open source. Whether you're new to open source or looking to expand your profile, this session will help you discover practical, beginner-friendly ways to contribute and stay engaged in the long term.", "description": "Open-source projects thrive on contributions, but those contributions don\u2019t always come in the form of pull requests. 
In the Python community, roles such as documentation writing, bug reproduction, testing, onboarding, user feedback, and project coordination are vital to long-term sustainability.\r\n\r\nThis talk aims to dispel the myth that only seasoned developers or prolific coders can contribute meaningfully to open-source projects. Through real-world examples and lessons from my own experience working with Python-based open-source communities, I\u2019ll walk the audience through practical paths for getting involved \u2014 even if you're just starting or come from a non-traditional background like product, design, or DevRel.\r\n\r\nThe session will outline the different ways contributions are recognized in the Python ecosystem, including the impact of GitHub discussions, contributing guides, documentation standards like reStructuredText or Markdown, and the importance of clear communication with maintainers.\r\n\r\nExpected audience: Python developers, career switchers, junior engineers, community managers, and anyone curious about participating in open source.\r\n\r\nTakeaway: You'll leave with an actionable roadmap to contribute beyond code and understand how to track and present your work to peers, employers, and the broader Python community.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DJE38C", "name": "Iyanu Falaye", "avatar": "https://cfp.pydata.org/media/avatars/DJE38C_9CBdOuA.webp", "biography": "Iyanu Falaye is a software engineer and product strategist with a passion for open-source communities and developer enablement. With experience spanning engineering operations, product development, and cross-functional collaboration, he actively supports inclusive contribution models beyond just code. 
Iyanu has spoken at community tech events and facilitated team knowledge-sharing sessions, always focused on helping others grow their impact in the tech ecosystem.", "public_name": "Iyanu Falaye", "guid": "02239391-88e9-5dc0-9f47-5fe27cd425fe", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/DJE38C/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/GPFCXZ/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/GPFCXZ/", "attachments": []}, {"guid": "bc54cd2b-876f-581a-a173-833d12defd38", "code": "QMUABM", "id": 78576, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/QMUABM/FF_vcePwvh.png", "date": "2025-12-09T13:30:00+00:00", "start": "13:30", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-78576-open-source-models-security-adversarial-attacks-poisoning-sponge", "url": "https://cfp.pydata.org/pydataglobal2025/talk/QMUABM/", "title": "Open Source Models' Security- Adversarial attacks, Poisoning & Sponge", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "The use of open-source models is rapidly increasing. According to Gartner, during the Magnetic Era, their adoption is expected to triple compared to foundational models. However, this rise in usage also brings heightened cybersecurity risks. In this lecture, we will explore the unique vulnerabilities associated with open-source models, the algorithmic techniques used to exploit them, and how our startup is addressing these challenges.", "description": "In my lecture, I will discuss various methods for attacking machine learning models, including model poisoning, DDoS-style attacks, and the generation of adversarial examples\u2014such as Projected Gradient Descent (PGD), Carlini-Wagner attacks, and others. We will also present defense strategies that are data-agnostic and focus on model-driven approaches to protecting AI systems, particularly those that use open-source models. 
We will also discuss the differentiation between protecting open-source models and regular LLM (what we are not OWASP LLM)", "recording_license": "", "do_not_record": false, "persons": [{"code": "ZKSPAE", "name": "Natan Katz", "avatar": "https://cfp.pydata.org/media/avatars/ZKSPAE_usoTBfG.webp", "biography": "Natan Katz is the co-founder of LuminAI, a startup pioneering statistical red teaming \u2014 a method for testing and securing white-box AI models through statistical and geometric analysis of model activations. At LuminAI, he develops techniques to detect and defend against optimization-based adversarial attacks such as PGD, DeepFool, and Carlini\u2013Wagner, helping organizations build safer and more trustworthy AI systems.\r\n\r\nBefore founding LuminAI, Natan worked across diverse applied domains \u2014 from quantitative modeling and speech analysis to customer journey optimization and biometrics \u2014 bridging theory and practice across industries. He has also published work on AI for Ethereum ecosystems and AI ethics. Natan holds an M.Sc. 
in Nonlinear Dynamics from the Weizmann Institute of Science, where he studied dynamic models for malignant tissues.", "public_name": "Natan Katz", "guid": "567a3bf6-89ca-5482-8946-4edcfd0ada9a", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/ZKSPAE/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/QMUABM/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/QMUABM/", "attachments": []}, {"guid": "4731474e-362f-55b0-ad39-95481b9e0cec", "code": "FHSZP7", "id": 84929, "logo": null, "date": "2025-12-09T14:00:00+00:00", "start": "14:00", "duration": "01:00", "room": "General Track", "slug": "pydataglobal2025-84929-opening-notes-keynote-by-isabel-zimmerman", "url": "https://cfp.pydata.org/pydataglobal2025/talk/FHSZP7/", "title": "Opening Notes & Keynote by Isabel Zimmerman", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Keynote by Isabel Zimmerman", "description": "Isabel is a Senior Software Engineer at Posit, PBC.", "recording_license": "", "do_not_record": false, "persons": [], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/FHSZP7/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/FHSZP7/", "attachments": []}, {"guid": "d7d4f331-caa3-56de-aae1-f7ca3a792388", "code": "TTDNXY", "id": 78440, "logo": null, "date": "2025-12-09T15:30:00+00:00", "start": "15:30", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-78440-python-worst-practices-learn-from-the-expert", "url": "https://cfp.pydata.org/pydataglobal2025/talk/TTDNXY/", "title": "Python Worst Practices: Learn from the Expert", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "Data and Analytics Comedian Evan Wimpey is here to roast his own codebase! Enjoy the walk through of the worst Python habits. 
In this talk, you'll get to see:\r\n* Incomprehensible variable names\r\n* final_final_2.ipynb files\r\n* rerunning the same cell and hoping it works this time\r\n* imports that are never used\r\n* debugging with print\r\n* ML models that are validated on training data\r\n* code so poorly written that even ChatGPT can't understand it\r\n* and more!", "description": "This is meant to be comedy, but the best jokes always include a little bit of truth. Evan will share some hilarious stories of Python gone wrong, complete with code examples. That is, if the code even runs.", "recording_license": "", "do_not_record": false, "persons": [{"code": "UUXFSA", "name": "Evan Wimpey", "avatar": "https://cfp.pydata.org/media/avatars/UUXFSA_kUBl73z.webp", "biography": "Evan Wimpey is an analytics professional turned stand-up comedian, delivering smart, custom comedy. Whether you're hosting a tech offsite, academic event, or a product team that just needs a laugh, Evan tailors content that resonates with your audience.", "public_name": "Evan Wimpey", "guid": "16000b7b-a2c6-53e9-a937-cef4a9155fc0", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/UUXFSA/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/TTDNXY/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/TTDNXY/", "attachments": []}, {"guid": "b0e2437b-9031-5da0-a978-e086fad5afd7", "code": "QHTA73", "id": 78865, "logo": null, "date": "2025-12-09T16:00:00+00:00", "start": "16:00", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-78865-text-mining-orkut-s-community-data-with-python-cultural-memory-platform-neglect-and-digital-amnesia", "url": "https://cfp.pydata.org/pydataglobal2025/talk/QHTA73/", "title": "Text Mining Orkut\u2019s Community Data with Python: Cultural Memory, Platform Neglect, and Digital Amnesia", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "*Orkut* was once the emotional and cultural 
core of Brazil\u2019s internet. Its scraps, testimonials, and communities gave users a way to publicly shape identity, build relationships, and engage with everything from music and religion to politics and humor. When Google shut it down in 2014, most of its data was deleted. What remains today is fragmented and buried in the Wayback Machine.\r\n\r\nIn this talk, I use Python to recover and analyze limited traces of *Orkut*\u2019s digital legacy. I scraped thousands of community names from archived HTML using `requests` and `BeautifulSoup`, processed them with multilingual sentence embeddings from `sentence-transformers`, and applied `scikit-learn` and `BERTopic` to cluster the data, surface major social themes, and quantify them. These techniques reveal how users created meaning, formed subcultures, and expressed identity through online interactions.\r\n\r\nAlongside the technical walkthrough, I draw on Cory Doctorow\u2019s concept of *enshittification*, defined as the slow decline of platforms as they shift from serving users to exploiting them. *Orkut* is a case of *enshittification* by neglect: its shutdown led not just to the death of a platform, but to the erasure of a generation\u2019s digital memory. According to Google's farewell announcement, over its 10 years of existence, *Orkut* hosted 51 million communities, 120 million discussion topics, and more than 1 billion interactions; most of which were permanently deleted.\r\n\r\nThis talk is for Python users interested not only in working with social media text data but also in uncovering the cultural narratives embedded within it. It invites the audience to see datasets as more than technical artifacts, viewing them instead as living records of online social life.", "description": "This talk explores how Python can be used to recover and analyze digital traces from a platform that once defined Brazil\u2019s online culture. 
*Orkut*, active from 2004 to 2014, hosted millions of communities where users expressed identity, humor, politics, and emotion in public and often poetic ways. When the platform was shut down, nearly all of this user-generated data was deleted. Today, only fragmented pieces remain, preserved in the Wayback Machine.\r\n\r\nI present a data analysis project that extracts and categorizes *Orkut* community names using open-source Python tools. I use `requests` and `BeautifulSoup` to scrape data from archived HTML snapshots. I then apply multilingual sentence embeddings from the `sentence-transformers` library to generate vector representations of the text, followed by clustering techniques using `scikit-learn` and `BERTopic` to uncover and quantify recurring social themes.\r\n\r\nThis technical walkthrough is grounded in a sociological lens. I draw on Cory Doctorow\u2019s concept of *enshittification*, which describes how platforms degrade as they prioritize value extraction over user experience. *Orkut*'s case illustrates how platform neglect can result not only in product death but also in large-scale cultural erasure. By treating community names as social artifacts, I show how data science can help recover forgotten histories and highlight overlooked communities at the intersection of digital humanities, memorialization, and cultural heritage.\r\n\r\nAttendees will gain practical skills in web scraping, multilingual NLP, and unsupervised clustering. The talk also raises broader questions about data loss, platform decay, and the ethical role of data scientists, software engineers, and tech workers in preserving digital memory.\r\n\r\nNo advanced data science, scraping, text mining, or NLP knowledge is required. The talk is best suited for data scientists and Python developers interested in working with real-world social data and approaching datasets with both technical rigor and cultural sensitivity. 
Regardless of background, this talk is accessible to anyone interested in data science, NLP, and text mining.\r\n\r\n**Time Breakdown (30 min)**\r\n| **Time**  | **Section**                                                          |\r\n| --------- | ------------------------------------------------------------------------------------- |\r\n| 0\u20134 min   | Introduction to *Orkut* and its cultural role in Brazil and in the Global South                |\r\n| 4\u20137 min   | Platform shutdown, data loss, digital memory and neglect         |\r\n| 7\u201310 min  | Project overview: goals, ethical framing, and data source (Wayback) |\r\n| 10\u201315 min | Scraping with `requests` and `BeautifulSoup` from archived HTML      |\r\n| 15\u201320 min | Processing: multilingual embeddings with `sentence-transformers`     |\r\n| 20\u201323 min | Clustering and theme discovery using `scikit-learn` and `BERTopic`   |\r\n| 23\u201326 min | Insights: social themes, quantification, and what topic categories mattered to users   |\r\n| 26\u201329 min | Reflection: *enshittification*, data loss, and cultural preservation   |\r\n| 29\u201330 min | Final remarks and invitation to rethink data as memory + Q\\&A        |\r\n\r\n**Additional remarks:**\r\n1) A GitHub repository containing the scraping scripts, archived HTML files, datasets, and analysis will be shared with attendees.\r\n2) This project was inspired by both personal nostalgia and frustration over the loss of access to my *Orkut*'s profile, photos, testimonials, and communities.\r\n3) Besides its overwhelming popularity in Brazil, *Orkut* also had a strong foothold in other countries across the Global South, such as India and China, reflecting its **broader appeal beyond the English-speaking tech centers**, typically prioritized in platform histories. 
This context would makes the talk proposal herein outlined even more interesting and compelling for a **PyData Global audience**.\r\n\r\n| Country         | Traffic on Mar 31, 2004 | Traffic on Sep 30, 2014 |\r\n|----------------|--------------------------|--------------------------|\r\n| Brazil         | 5.16%                    | 55.5%                    |\r\n| United States  | 51.36%                   | 3.3%                     |\r\n| India          | \u2014                        | 18.4%                    |\r\n| China          | \u2014                        | 6.4%                     |\r\n| Japan          | 7.74%                    | 2.7%                     |\r\n| Netherlands    | 4.10%                    | \u2014                        |\r\n| United Kingdom | 3.72%                    | \u2014                        |\r\n| Other          | 27.92%                   | 15.7%                    |\r\nReference: https://web.archive.org/web/20140109153358/http://www.alexa.com/siteinfo/orkut.com.br", "recording_license": "", "do_not_record": false, "persons": [{"code": "UYRBHX", "name": "Rodrigo Silva Ferreira", "avatar": "https://cfp.pydata.org/media/avatars/UYRBHX_h8grDy9.webp", "biography": "Rodrigo Silva Ferreira is a QA Engineer at *Posit*, where he contributes to the quality and usability of open-source tools that empower data scientists working in *R* and *Python*. He focuses on both manual and automated testing strategies to ensure reliability, performance, and an excellent user experience.\r\n\r\nRodrigo holds a BSc. in Chemistry with minors in Applied Math and Arabic from NYU Abu Dhabi and a MSc. in Analytical Chemistry from the University of Pittsburgh. 
Multilingual and globally minded, he enjoys working at the intersection of data, science, and technology \u2014 especially when it means building tools that help people better understand and navigate the world through its increasingly complex data.", "public_name": "Rodrigo Silva Ferreira", "guid": "49ef2195-55d3-55a9-800f-c903e59f1bc4", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/UYRBHX/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/QHTA73/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/QHTA73/", "attachments": []}, {"guid": "71f55466-1ff6-534e-a41a-6c9c371367f4", "code": "FSTP8H", "id": 79202, "logo": null, "date": "2025-12-09T16:30:00+00:00", "start": "16:30", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-79202-why-julia-s-gpu-accelerated-ode-solvers-are-20x-100x-faster-than-jax-and-pytorch", "url": "https://cfp.pydata.org/pydataglobal2025/talk/FSTP8H/", "title": "Why Julia's GPU-Accelerated ODE Solvers are 20x-100x Faster than Jax and PyTorch", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "You may have seen the benchmark results and thought, \"how the heck are the Julia ODE solvers on GPUs orders of magnitude faster than the GPU-accelerated Python libraries, that can't be true?\" In this talk I will go into detail about the architectural differences between the Julia approaches to generating GPU-accelerated solvers vs the standard ML library approach to GPU usage. 
By the end of the talk you'll have a good enough understanding of models of GPU acceleration to understand why this performance difference exists, and the many applications that can take advantage of this performance improvement.", "description": "This talk is about the results of the publication titled \"Automated translation and accelerated solving of differential equations on multiple GPU platforms\" which was published in 2024 demonstrating that the Julia GPU-based ODE solvers, specifically DiffEqGPU.jl, are 20x-100x faster than Jax (diffrax) and PyTorch (torchdiffeq). The publication goes into detail as to the architectural reasons for the performance difference, even going as far as recreating the ML style of GPU acceleration in Julia in order to demonstrate that such an approach loses the performance advantage, along with testing against alternative CUDA C++ implementations of a similar form to showcase exactly the effects of the architectural decisions on the resulting performance. However, as a highly technical article it can many times not be as easy to understand as it should. In this talk we're going to give a barebones \"no HPC background required\" explanation of how the Julia GPU stack enables a completely different approach from the \"standard\" ML libraries form of GPU acceleration, and how for some applications this can be majorly beneficial. 
We will note that the GPU design of the ML libraries is actually optimal for ML applications, but certain properties of some applications of ODE solvers make it require a completely different formulation.\r\n\r\nWe will additionally talk about other projects which have seen similar results, such as solving nonlinear systems in Julia (with NonlinearSolve.jl), GPU-accelerated optimization with Optimization.jl, and new global optimizer methods in ParallelParticleSwarms.jl which all rely on this technique and the special aspects of the Julia GPU infrastructure.\r\n\r\n[1] https://www.sciencedirect.com/science/article/abs/pii/S0045782523007156", "recording_license": "", "do_not_record": false, "persons": [{"code": "WUWQQ3", "name": "Chris Rackauckas", "avatar": "https://cfp.pydata.org/media/avatars/WUWQQ3_otHw1Wk.webp", "biography": "Dr. Chris Rackauckas is the VP of Modeling and Simulation at JuliaHub, the Director of Scientific Research at Pumas-AI, Co-PI of the Julia Lab at MIT, and the lead developer of the SciML Open Source Software Organization. For his work in mechanistic machine learning, his work is credited for the 15,000x acceleration of NASA Launch Services simulations and recently demonstrated a 60x-570x acceleration over Modelica tools in HVAC simulation, earning Chris the US Air Force Artificial Intelligence Accelerator Scientific Excellence Award. See more at https://chrisrackauckas.com/. He is the lead developer of the Pumas project and received a top presentation award at every ACoP from 2019-2021 for improving methods for uncertainty quantification, automated GPU acceleration of nonlinear mixed effects modeling (NLME), and machine learning assisted construction of NLME models with DeepNLME. 
For these achievements, Chris received the Emerging Scientist award from ISoP.", "public_name": "Chris Rackauckas", "guid": "6ab02490-d781-5300-94af-5f4ffbefbdff", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/WUWQQ3/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/FSTP8H/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/FSTP8H/", "attachments": []}, {"guid": "23c5ebc2-872f-5f48-82f6-f00d131b8f08", "code": "93KHNT", "id": 79374, "logo": null, "date": "2025-12-09T17:30:00+00:00", "start": "17:30", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-79374-bridging-interactive-data-science-and-big-data-with-hybrid-execution", "url": "https://cfp.pydata.org/pydataglobal2025/talk/93KHNT/", "title": "Bridging Interactive Data Science and Big Data with Hybrid Execution", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "Hybrid Execution is a new capability introduced in the open-source Modin library that lets developers write familiar pandas code while automatically selecting the most efficient execution backend. Small datasets run locally for fast, interactive development, while larger workloads are transparently pushed down to distributed backends for scalable, high-performance execution. This approach enables faster development for rapid prototyping and iteration and future-proofs pipelines as data volumes grow.", "description": "pandas is one of the most widely used tools in the Python ecosystem, but scaling it beyond memory limits has traditionally required significant refactoring or switching to other tools. In this talk, we introduce Hybrid Execution, a new capability powered by Modin that allows pandas code to seamlessly switch between local, in-memory execution and distributed backends. This approach preserves the familiar pandas API while enabling users to scale their workflows without rewriting code. 
We'll explore how Hybrid Execution works under the hood, how Modin enables backend flexibility, and what it means for building interactive, scalable data pipelines with pandas.", "recording_license": "", "do_not_record": false, "persons": [{"code": "SZHAJX", "name": "Jonathan Shi", "avatar": "https://cfp.pydata.org/media/avatars/SZHAJX_MNMqiaL.webp", "biography": "Jonathan is a software engineer on Snowflake's Snowpark Python team, and is a maintainer of the Modin project. He enjoys building systems that are usable, maintainable, and performant.", "public_name": "Jonathan Shi", "guid": "c9f397e5-6092-56ff-b9d9-409776337ba1", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/SZHAJX/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/93KHNT/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/93KHNT/", "attachments": []}, {"guid": "6db4a9ad-d41b-500b-9ad0-2d3066b7fafe", "code": "JCXBBW", "id": 78522, "logo": null, "date": "2025-12-09T18:00:00+00:00", "start": "18:00", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-78522-projspec-what-s-this-project-anyway", "url": "https://cfp.pydata.org/pydataglobal2025/talk/JCXBBW/", "title": "projspec: what's this project anyway?", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "Most code and related workflows take place in \"projects\", directories with descriptive metadata. There are so many types of these around these days, it is hard to know what is contained where. projspec solves this for the majority of the python-data ecosystem, so that you can introspect your projects, act on them, and search across all your projects, local or remote.", "description": "Daily workflows in pydata usually occur in the context of projects - a directory tree of stuff, with special metadata files describing those contents. 
Many metadata specifications are in use for each of the many tools that operate on projects, storing information in small yaml, toml or json files, or in the pyproject.toml file for python-specific projects. This model encompasses not only the majority of the environment management tools and task runners in pydata (uv, pixi, poetry, etc) but other essential tools (e.g., git), definitions (e.g., hugging-face dataset), deployment (briefcase, helm, wheel) and workflow-specific metadata (e.g., pyscript). \r\n\r\nThe range of possible metadata is bewildering! Most projects show how to invoke their functionality in README files, with the first step downloading some specific tool. In some way, all this flexibility has taken us backwards. There is no easy way to tell what type a project is and what definitions it contains without  reading the supporting documentation and browsing specific files, or even downloading the whole thing and running a specific tool against it.\r\n\r\nprojspec aspires to be a layer over the most common pydata related project types. It provides introspection of project type and contents from the metadata definitions, and this can be done on remote project directories too. For each project type, we infer a set of \"contents\" (things that are defined in the project and inherently part of it) and \"artifacts\" (things the project can make or do, usually by calling a subprocess). A project can be multiple types at once: a project designed to be executed with pixi, for instance, still likely contains git information and may also have dataset declarations, things that pixi is not concerned with. Projects may also contain sub-projects of the same or different type, e.g., a conda recipe alongside a code library.\r\n\r\nProjspec, due to be released in time for this talk, will provide a handy API to work with projects of many types, including introspection and effecting actions. 
It will have a way to index many projects locally or remotely, to allow for querying with complex criteria, to find the project that matches your needs - contains certain datasets, depends on specific library/versions or is capable of creating particular output types. We will demonstrate all of this!", "recording_license": "", "do_not_record": false, "persons": [{"code": "MT93ZS", "name": "Martin Durant", "avatar": null, "biography": null, "public_name": "Martin Durant", "guid": "954c2cf3-e966-57f3-9763-f215cd7d9c14", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/MT93ZS/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/JCXBBW/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/JCXBBW/", "attachments": []}, {"guid": "57b90fdf-9ae7-5581-9a98-f8c9786b1873", "code": "JVJZFT", "id": 84935, "logo": null, "date": "2025-12-09T18:30:00+00:00", "start": "18:30", "duration": "00:45", "room": "General Track", "slug": "pydataglobal2025-84935-keynote-by-lisa-amini-what-s-next-in-ai-for-data-and-data-management", "url": "https://cfp.pydata.org/pydataglobal2025/talk/JVJZFT/", "title": "Keynote by Lisa Amini- What\u2019s Next in AI for Data and Data Management?", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Advances in large language models (LLMs) have propelled a recent flurry of AI tools for data management and operations.  For example, AI-powered code assistants leverage LLMs to generate code for dataflow pipelines. RAG pipelines enable LLMs to ground responses with relevant information from external data sources. Data agents leverage LLMs to turn natural language questions into data-driven answers and actions.  While challenges remain, these advances are opening exciting new opportunities for data scientists and engineers.  
In this talk, we will examine recent advances, along with some still incubating in research labs, with the goal of understanding where this is all heading, and present our perspective on what\u2019s next for AI in data management and data operations.", "description": "Dr. Lisa Amini leads IBM's Data & AI Platforms Research efforts globally, along with IBM's AI Horizons Network. She is also an IBM Distinguished Engineer (DE). The mission of the Data & AI Platforms Research theme is to infuse generative and agentic AI throughout IBM's Data Platform, to make it more intelligent, self-service, and autonomous, and to optimize its performance on AI workloads.", "recording_license": "", "do_not_record": false, "persons": [], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/JVJZFT/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/JVJZFT/", "attachments": []}], "Machine Learning & AI": [{"guid": "e01c956e-ddf4-5981-91cc-0d1aef9e4d68", "code": "QREPPX", "id": 78637, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/QREPPX/genai-sdlc_neLT9g1.png", "date": "2025-12-09T10:30:00+00:00", "start": "10:30", "duration": "01:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78637-building-llm-powered-applications-for-data-scientists-and-software-engineers", "url": "https://cfp.pydata.org/pydataglobal2025/talk/QREPPX/", "title": "Building LLM-Powered Applications for Data Scientists and Software Engineers", "subtitle": "", "track": "Machine Learning & AI", "type": "Tutorial", "language": "en", "abstract": "This workshop is designed to equip software engineers with the skills to build and iterate on generative AI-powered applications. Participants will explore key components of the AI software development lifecycle through first principles thinking, including prompt engineering, monitoring, evaluations, and handling non-determinism. 
The session focuses on using multimodal AI models to build applications, such as querying PDFs, while providing insights into the engineering challenges unique to AI systems. By the end of the workshop, participants will know how to build a PDF-querying app, but all techniques learned will be generalizable for building a variety of generative AI applications.\r\n\r\nIf you're a data scientist, machine learning practitioner, or AI enthusiast, this workshop can also be valuable for learning about the software engineering aspects of AI applications, such as lifecycle management, iterative development, and monitoring, which are critical for production-level AI systems.", "description": "This workshop is designed to equip software engineers with the skills to build and iterate on generative AI-powered applications. Participants will explore key components of the AI software development lifecycle through first principles thinking, including prompt engineering, monitoring, evaluations, and handling non-determinism. The session focuses on using LLMs to build applications, such as querying PDFs, while providing insights into the engineering challenges unique to AI systems. 
By the end of the workshop, participants will know how to build a PDF-querying app, but all techniques learned will be generalizable for building a variety of generative AI applications.\r\n\r\nIf you're a data scientist, machine learning practitioner, or AI enthusiast, this workshop can also be valuable for learning about the software engineering aspects of AI applications, such as lifecycle management, iterative development, and monitoring, which are critical for production-level AI systems.\r\n\r\n**What You'll Learn:**\r\n\r\n* How to integrate AI models and APIs into a practical application.\r\n* Techniques to manage non-determinism and optimize outputs through prompt engineering.\r\n* How to monitor, log, and evaluate AI systems to ensure reliability.\r\n* The importance of handling structured outputs and using function calling in AI models.\r\n* The software engineering side of building AI systems, including iterative development, debugging, and performance monitoring.\r\n* Practical experience in building an app to query PDFs using multimodal models.\r\n\r\n**What is Unique About This Session:**\r\n\r\nThis workshop uniquely bridges the gap between software engineering and generative AI development. While most AI workshops focus solely on model usage or tuning, this session emphasizes the entire AI software lifecycle \u2014 from prompt engineering to monitoring and tracing. Participants will learn how to manage non-determinism and create production-ready AI applications, giving them the knowledge to tackle the software engineering challenges of AI-powered apps. 
The hands-on approach ensures that attendees walk away with practical skills and a functional app.\r\n\r\n**Workshop Prerequisite Knowledge:**\r\n* Basic programming knowledge in Python.\r\n* Familiarity with REST APIs.\r\n* Experience working with Jupyter Notebooks or similar environments (preferred but not required).\r\n* No prior experience with AI or machine learning is required.\r\n* Most importantly, a sense of curiosity and a desire to learn!\r\n\r\nIf you have a background in data science, ML, or AI, this workshop will help you understand the software engineering side of building AI applications.\r\n\r\nWe will introduce you to certain modern frameworks in the workshop but the emphasis will be on first principles and using vanilla Python and LLM calls to build AI-powered systems.\r\n\r\n[All tutorial material will be in this github repository](https://github.com/hugobowne/AI-for-SWEs).", "recording_license": "", "do_not_record": false, "persons": [{"code": "CDKDS9", "name": "hugo bowne-anderson", "avatar": "https://cfp.pydata.org/media/avatars/CDKDS9_CQ04pNQ.webp", "biography": "Hugo Bowne-Anderson is an independent data and AI consultant with extensive experience in the tech industry. He is the host of the industry podcast [Vanishing Gradients](https://vanishinggradients.fireside.fm/), where he explores cutting-edge developments in data science and artificial intelligence.\r\nAs a data scientist, educator, evangelist, content marketer, and strategist, Hugo has worked with leading companies in the field. His past roles include Head of Developer Relations at Outerbounds, a company committed to building infrastructure for machine learning applications, and positions at Coiled and DataCamp, where he focused on scaling data science and online education respectively.\r\nHugo's teaching experience spans from institutions like Yale University and Cold Spring Harbor Laboratory to conferences such as SciPy, PyCon, and ODSC. 
He has also worked with organizations like Data Carpentry to promote data literacy.\r\nHis impact on data science education is significant, having developed over 30 courses on the DataCamp platform that have reached more than 3 million learners worldwide. Hugo also created and hosted the popular weekly data industry podcast DataFramed for two years.\r\nCommitted to democratizing data skills and access to data science tools, Hugo advocates for open source software both for individuals and enterprises.", "public_name": "hugo bowne-anderson", "guid": "d66a55c0-508d-5283-a06b-998283286370", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/CDKDS9/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/QREPPX/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/QREPPX/", "attachments": []}, {"guid": "adfdcbf2-bfb1-5f4a-8380-9c21960bc54d", "code": "BQLTSH", "id": 78399, "logo": null, "date": "2025-12-09T12:00:00+00:00", "start": "12:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78399-when-ai-makes-things-up-understanding-and-tackling-hallucinations", "url": "https://cfp.pydata.org/pydataglobal2025/talk/BQLTSH/", "title": "When AI Makes Things Up: Understanding and Tackling Hallucinations", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "AI systems are increasingly being integrated into real-world products - from chatbots and search engines to summarisation tools and coding assistants. Yet, despite their fluency, these models can produce confident but false or misleading information, a phenomenon known as **hallucination**. In production settings, such errors can erode user trust, misinform decisions, and introduce serious risks. This talk unpacks the root causes of hallucinations, explores their impact on various applications, and highlights emerging techniques to detect and mitigate them. 
With a focus on practical strategies, the session offers guidance for building more trustworthy AI systems fit for deployment.", "description": "This session will unpack the problem of AI hallucination - not just what it is, but how it surfaces in everyday use. We\u2019ll look at the common causes, ranging from incomplete context to over-generalisation, and walk through detection and prevention techniques such as grounding, prompt design and RAG. Whether you\u2019re building AI products or evaluating outputs, this talk will give you the tools to recognise hallucinations and reduce their risk.\r\n\r\n##### Outline:\r\n* Introduction to hallucinations in LLMs\r\n* Common causes behind hallucinated outputs\r\n* Impact on production applications\r\n* Techniques for detecting and evaluating hallucinations\r\n* Strategies to reduce hallucinations\r\n* Best practices for building trustworthy AI products\r\n* Key takeaways\r\n\r\n##### Background Knowledge Required:\r\nBeginner-friendly - no prior knowledge needed. Familiarity with LLMs is a plus but not necessary.", "recording_license": "", "do_not_record": false, "persons": [{"code": "SFVFTF", "name": "Aarti Jha", "avatar": "https://cfp.pydata.org/media/avatars/SFVFTF_QOJp9Pp.webp", "biography": "Aarti Jha is a Principal Data Scientist at Red Hat, where she develops AI-driven solutions to streamline internal processes and reduce operational costs. She brings over 6.5 years of experience in building and deploying data science and machine learning solutions across industry domains. 
She is an active public speaker and frequently presents at developer and data-science conferences, focusing on practical approaches to applied AI and LLMs.", "public_name": "Aarti Jha", "guid": "bdb6953a-cbfd-5f15-95a6-2b128c163ad4", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/SFVFTF/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/BQLTSH/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/BQLTSH/", "attachments": [{"title": "PyData Global Slide Deck", "url": "/media/pydataglobal2025/submissions/BQLTSH/resources/PyData_G_JfsAxBV.pptx", "type": "related"}]}, {"guid": "0abeaa90-584d-5f30-be25-627587375de9", "code": "JGSYEP", "id": 78661, "logo": null, "date": "2025-12-09T12:30:00+00:00", "start": "12:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78661-torchtextclassifiers-modernizing-text-classification-for-french-national-statistics", "url": "https://cfp.pydata.org/pydataglobal2025/talk/JGSYEP/", "title": "torchTextClassifiers : Modernizing Text classification for French National Statistics", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Discover how Insee (French National Statistics Institute) transitioned from fastText to a PyTorch-based model for text classification by developing and open-sourcing the torchTextClassifiers  python package. This presentation will cover the creation, deployment, and practical applications of torchTextClassifiers in modernizing automatic coding systems, benefiting Insee and other European National Statistical Institutes (NSIs).", "description": "Insee, France's National Institute of Statistics and Economic Studies, has long relied on fastText for automatic coding tasks. 
Recognizing the need to modernize and future-proof this critical functionality, we developed torchTextClassifiers \u2014 an open-source Python package that enables easy training and deployment of a PyTorch-based model for text classification, paving the way for further innovation in this domain.\r\n\r\nThis session will delve into the motivations behind replacing the archived fastText package, the design and implementation of torchTextClassifiers , and its integration into Insee's production environment. We'll discuss the challenges faced during this transition, including model compatibility, performance optimization, and user adoption.\u200b\r\n\r\nAttendees will gain insights into:\u200b\r\n\r\n- The rationale for moving from fastText to a PyTorch-based model\u200b in production\r\n\r\n- Packaging a PyTorch-based model architecture and open-source collaboration\r\n\r\n- Key features and architecture of torchTextClassifiers \u200b\r\n\r\n- Deployment strategies within a public administration (MLOps, cloud native tools, security)\r\n\r\n- Lessons learned and best practices for similar transitions\u200b\r\n\r\nThis talk is intended for data scientists, machine learning engineers, and practitioners interested in NLP, model deployment, and open-source tool development.", "recording_license": "", "do_not_record": false, "persons": [{"code": "WULQUH", "name": "Meilame Tayebjee", "avatar": "https://cfp.pydata.org/media/avatars/WULQUH_6nVxQ0z.webp", "biography": "As a Data Scientist at the Innovation Lab of the French National Institute of Statistics and Economic Studies (Insee), I focus on the deployment of machine learning models, the enhancement of MLOps best practices, and the development of torchTextClassifiers, a PyTorch package designed to streamline the training of deep learning models for text classification.\r\n\r\nI am also pursuing a PhD in Computer Science jointly at CREST and Inria, where my research centers on foundational Transformer-based models for the 
analysis of healthcare pathways.", "public_name": "Meilame Tayebjee", "guid": "f4434156-0043-5699-a16f-b3493828210c", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/WULQUH/"}, {"code": "PFLWYG", "name": "C\u00e9dric Couralet", "avatar": "https://cfp.pydata.org/media/avatars/PFLWYG_EE7k6xu.webp", "biography": "C\u00e9dric Couralet, Data Scientist at Insee, is an open-source enthusiast, with expertise in software architecture and secure system design.", "public_name": "C\u00e9dric Couralet", "guid": "2b4b1583-c929-5392-9ad4-cc72dddb29f7", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/PFLWYG/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/JGSYEP/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/JGSYEP/", "attachments": []}, {"guid": "921e0c19-74cc-5a55-88dc-cbfc06648a7a", "code": "GMWTUK", "id": 78761, "logo": null, "date": "2025-12-09T13:00:00+00:00", "start": "13:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78761-harnessing-generative-models-for-synthetic-non-life-insurance-data", "url": "https://cfp.pydata.org/pydataglobal2025/talk/GMWTUK/", "title": "Harnessing Generative Models for Synthetic Non-Life Insurance Data", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "This study is oriented to a synthetic non-life insurance premium dataset generated using several Generative Models. As a benchmark, a Conditional Gaussian Mixture Model has been employed. The validation of the generated data involved several steps: data visualisation, comparison with univariate analysis, PCA and UMAP representations between the trained data and the generated samples. 
In addition, to check the consistency of the data produced, the statistical Kolmogorov\u2013Smirnov test and predictive modelling of frequency and severity with Generalised Linear Models (GLMs) based on the Tweedie distribution were used as measures of the generated data's quality, followed by evidence of feature importance. For further comparison, advanced Deep Learning architectures have been employed: Conditional Variational Autoencoders (CVAEs), CVAEs enhanced with a Transformer Decoder, a Conditional Diffusion Model, and Large Language Models. The analysis assesses each model\u2019s ability to capture the underlying distributions, preserve complex dependencies, and maintain relationships intrinsic to the premium data. These findings provide insightful directions for enhancing synthetic data generation in insurance, with potential applications in risk modelling, pricing strategies with data scarcity, and regulatory compliance.", "description": "In classification and regression tasks, generative models aim to learn the joint probability distribution of data. These models focus on generating data points similar to the training data. Open insurance datasets are rare because they encode proprietary risk structures of the Company, limiting researchers\u2019 access to comprehensive data for analysis and assessing new approaches. 
Generative models enable reproducible experimentation and innovation today.\r\nIn the talk I explore several generative models used to produce synthetic data.\r\n\r\n1) Conditional Gaussian Mixture Models used as a benchmark;\r\n2) Conditional Variational Autoencoders;\r\n3) Conditional Variational Autoencoders with a Transformer Decoder;\r\n4) Conditional Diffusion Model;\r\n5) Large Language Models.\r\n\r\nFinally, I gave the overall results, followed by different approaches.", "recording_license": "", "do_not_record": false, "persons": [{"code": "VDZH9N", "name": "Claudio Giorgio Giancaterino", "avatar": "https://cfp.pydata.org/media/avatars/VDZH9N_3nkkwlr.webp", "biography": "Statistics & Actuarial background\r\nActuary during the day\r\nData Scientist in the free time", "public_name": "Claudio Giorgio Giancaterino", "guid": "c2335e02-33a4-50ec-a031-320c25ad3203", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/VDZH9N/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/GMWTUK/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/GMWTUK/", "attachments": []}, {"guid": "a8466af3-a18a-55e0-ba21-b206ed08aec4", "code": "RL9RDQ", "id": 78780, "logo": null, "date": "2025-12-09T15:00:00+00:00", "start": "15:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78780-from-feature-engineering-to-context-engineering-for-agents", "url": "https://cfp.pydata.org/pydataglobal2025/talk/RL9RDQ/", "title": "From Feature Engineering to Context Engineering for Agents", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Context Engineering for Agents involves getting relevant data into the LLM\u2019s prompt and builds on in-context learning capabilities of LLMs. But LLMs have finite sized context windows, so you can't just dump unprocessed context data into your Agent's LLM prompt. 
You need to select the right data, process it into the correct format, and compress or summarize the data before its use as context data. \r\n\r\nIn this talk, we will introduce techniques for selection, preprocessing, and compression of context data, taking inspiration from the tried and tested techniques used for feature engineering for ML. What goes around, comes around.", "description": "Context Engineering for Agents involves getting relevant data into the LLM\u2019s prompt and builds on in-context learning capabilities of LLMs. But LLMs have finite sized context windows, so you can't just dump unprocessed context data into your Agent's LLM prompt. You need to select the right data, process it into the correct format, and compress or summarize the data before its use as context data. \r\n\r\nIn this talk, we will introduce techniques for selection, preprocessing, and compression of context data, taking inspiration from the tried and tested techniques used for feature engineering for ML. What goes around, comes around.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DBWSDW", "name": "Jim Dowling", "avatar": "https://cfp.pydata.org/media/avatars/DBWSDW_FP9ANWD.webp", "biography": "Jim Dowling is CEO of Hopsworks and a former Associate Professor at KTH Royal Institute of Technology. He is the organizer of the annual feature store summit and co-organizer of PyData Stockholm. 
He is the author of an O'Reilly book on building ML systems: batch, real-time and LLMs.", "public_name": "Jim Dowling", "guid": "4ffcaec0-f4da-54c6-b8a3-045b1feb09cc", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/DBWSDW/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/RL9RDQ/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/RL9RDQ/", "attachments": []}, {"guid": "e580a9b6-44f3-599d-bcfc-55288fd0bbe5", "code": "JVPL8S", "id": 79352, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/JVPL8S/NeMo_b40VXqv.png", "date": "2025-12-09T17:30:00+00:00", "start": "17:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-79352-scaling-data-processing-for-llms-with-nemo-curator", "url": "https://cfp.pydata.org/pydataglobal2025/talk/JVPL8S/", "title": "Scaling Data Processing for LLMs with NeMo Curator", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Training state-of-the-art Large Language Models (LLMs) increasingly relies on the availability of clean, diverse, and large-scale datasets. The traditional CPU-based preprocessing pipelines often become a bottleneck when curating datasets that span tens or hundreds of terabytes. In this talk, we introduce NeMo Curator, an open-source, GPU-accelerated data curation framework developed by NVIDIA. Built on Python and powered by RAPIDS, NeMo Curator enables scalable, high-throughput data processing for LLMs, including semantic deduplication, filtering, classification, PII redaction, and synthetic data generation. With support for multi-node, multi-GPU environments, the framework has demonstrated up to 7% improvement in downstream model performance on large-scale benchmarks. 
We will walk through its modular pipeline design, highlight real-world applications, and show how to integrate it into existing workflows for fast, reproducible, and efficient LLM training.", "description": "The development and performance of Large Language Models (LLMs) increasingly rely on the availability of high-quality, diverse, and representative datasets. Scaling data preparation for LLMs remains a significant bottleneck in training pipelines, particularly when dealing with massive raw web-scale data. Traditional CPU-based preprocessing frameworks are often too slow and resource-intensive to meet the growing demand for efficiency, scalability, and compliance. This talk presents NeMo Curator, an open-source, GPU-accelerated data curation framework designed to accelerate and streamline the preparation of massive datasets across multi-node, multi-GPU infrastructures.\r\n\r\nNeMo Curator introduces a modular pipeline architecture that enables high throughput preprocessing with native integration of RAPIDS for GPU acceleration. Its functionality spans semantic deduplication, heuristic filtering, automated classification, personally identifiable information (PII) redaction, and synthetic data generation. These features work in tandem to reduce noise, eliminate redundancy, and enhance data quality, ultimately improving LLM training outcomes. With support for reward-based filtering and configurable augmentation modules, NeMo Curator can generate or enhance data in low-resource domains while maintaining quality and diversity.\r\n\r\nThis talk will provide an informative walkthrough of NeMo Curator\u2019s capabilities and show how its pipelines can be integrated into existing workflows to preprocess massive datasets efficiently. Attendees will see how to configure and execute the framework through Python APIs, leveraging both single-node and distributed environments. 
By the end of this talk, participants will become familiar with scalable data curation techniques and walk away with practical tools to enhance their own LLM training pipelines using GPU-accelerated infrastructure.\r\n\r\nDetailed Outlines:\r\n1.\tChallenges in Scaling LLM Data Preparation (5 min)\r\n2.\tOverview of NeMo Curator Framework (10 min)\r\n3.\tPipeline Modules and Functional Components (5 min)\r\n4.\tDemonstration: Multi-GPU Pipeline Execution (5 min)\r\n5.\tCase Studies and Performance Metrics (5 min)\r\n\r\nTargeted Audience:\r\n\u2022\tData Scientist, ML/AI Engineer, AI Researcher", "recording_license": "", "do_not_record": false, "persons": [{"code": "BTDQLJ", "name": "Allison Ding", "avatar": "https://cfp.pydata.org/media/avatars/BTDQLJ_puO2yGF.webp", "biography": "Allison Ding is a developer advocate for GPU-accelerated AI APIs, libraries, and tools at NVIDIA, with a specialization in large language models (LLMs) and advanced data science techniques. She brings over eight years of hands-on experience as a data scientist, focusing on managing and delivering end-to-end data science solutions. Her academic background includes a strong emphasis on natural language processing (NLP) and generative AI. 
Allison holds a master\u2019s degree in Applied Statistics from Cornell University and a master\u2019s degree in Computer Science from San Francisco Bay University.", "public_name": "Allison Ding", "guid": "4d075d07-5a2c-53e4-a5ab-8ac9aa56690c", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/BTDQLJ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/JVPL8S/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/JVPL8S/", "attachments": []}, {"guid": "78351a6a-689d-59b7-8b90-9c409b96a66d", "code": "QWXTAN", "id": 78526, "logo": null, "date": "2025-12-09T18:00:00+00:00", "start": "18:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78526-i-built-a-transformer-from-scratch-so-you-don-t-have-to", "url": "https://cfp.pydata.org/pydataglobal2025/talk/QWXTAN/", "title": "I Built a Transformer from Scratch So You Don\u2019t Have To", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Want to understand how transformers actually work without wading through 10,000 lines of framework code or drowning in tensor shapes? This talk walks you through building a transformer model from scratch \u2014 no pre-trained shortcuts, no black-box abstractions \u2014 just clean PyTorch code and good old-fashioned curiosity. You'll walk away with a clearer mental model of how attention, encoders, decoders, and masking really work.", "description": "Transformers power modern large language models, but their inner workings are often buried under complex libraries and unreadable abstractions. 
In this talk, we\u2019ll peel back the layers and build the original Transformer architecture (Vaswani et al., 2017) step by step in PyTorch, from input embeddings to attention masks to the full encoder-decoder stack.\r\n\r\nThis talk is designed for attendees with a basic understanding of deep learning and PyTorch who want to go beyond surface-level blog posts and get a hands-on, conceptual grasp of what happens under the hood. You'll see how each part of the transformer connects back to the equations in the original paper, how to debug common implementation pitfalls, and how to avoid getting lost in tensor dimension hell.\r\n\r\nThis talk features:\r\n\r\n\ud83d\udd0d A walkthrough of key components: attention, positional encoding, encoder/decoder stack\r\n\r\n\ud83e\udde0 Visual explanations of attention masks, shapes, and residuals\r\n\r\n\u26a0\ufe0f Common bugs and debugging strategies (like handling shape mismatches and masking errors)\r\n\r\n\u2705 Real-world implementation tips and tricks that demystify the architecture\r\n\r\nBy the end of the talk, attendees will:\r\n\r\nUnderstand the full forward pass of a transformer\r\n\r\nKnow how each component connects to the original paper\r\n\r\nFeel more confident reading or writing custom model architectures\r\n\r\nThe tone will be light-hearted and educational \u2014 ideal for those who are mathematically curious but don\u2019t want to get bogged down in heavy theory. 
No prior experience building models from scratch required \u2014 just a working knowledge of Python and PyTorch.\r\n\r\n**Prior Knowledge Expected**\r\n\r\nBasic Python and PyTorch\r\n\r\nSome familiarity with neural networks (e.g., feedforward, softmax)\r\n\r\nNo need for prior experience in building models from scratch", "recording_license": "", "do_not_record": false, "persons": [{"code": "7LWGUU", "name": "Jen Wei", "avatar": "https://cfp.pydata.org/media/avatars/7LWGUU_6jJxl0Y.webp", "biography": "Jen Wei is an independent AI research engineer with a PhD in applied mathematics and a love for building things from scratch \u2014 especially when she probably shouldn\u2019t. She\u2019s reverse-engineered transformer architectures, implemented modern techniques like mixture-of-experts and Multi-head latent attention, and still enjoys writing clean PyTorch code at 2am for fun (and maybe for revenge). Jen currently works in the GenAI space and shares her work openly on Hugging Face. Her favorite research topics include efficient LLM architecture, post-training techniques, and the existential crises of overparameterized models.\r\n\r\n[\ud83c\udf38 Personal Website](https://birdofparadise.ai/experience/)\r\n[\ud83e\udd17 repo](https://huggingface.co/bird-of-paradise)\r\n[\ud835\udc17](https://x.com/JenniferWe17599)\r\n[Medium](https://medium.com/@jenwei0312)\r\n[\ud83d\udcbc LinkedIn profile](https://www.linkedin.com/in/jenweiprofile)", "public_name": "Jen Wei", "guid": "91e469a2-1bd6-5cff-9ae9-a419d3a7b22b", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/7LWGUU/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/QWXTAN/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/QWXTAN/", "attachments": []}], "Analytics, Visualization & Decision Science": [{"guid": "a77e9960-cfe2-5adc-ac65-34235b54239e", "code": "8NYGXU", "id": 78697, "logo": null, "date": "2025-12-09T12:30:00+00:00", "start": "12:30", "duration": 
"00:30", "room": "Analytics, Visualization & Decision Science", "slug": "pydataglobal2025-78697-scaling-fuzzy-product-matching-with-bm25-a-comparative-study-of-python-and-database-solutions", "url": "https://cfp.pydata.org/pydataglobal2025/talk/8NYGXU/", "title": "Scaling Fuzzy Product Matching with BM25: A Comparative Study of Python and Database Solutions", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Talk", "language": "en", "abstract": "Tired of exact matches failing on messy data? This talk showcases how BM25, a powerful fuzzy search algorithm, tackles the challenge of enriching massive datasets with noisy product names. We'll compare practical, large-scale implementations using Python's bm25s library (accelerated by GPUs) and DuckDB's built-in full-text search. Join us to learn how to achieve fast, accurate data integration and discover the optimal tools for your fuzzy matching needs.", "description": "**The problem at hand:**\r\nAre you constantly battling messy, inconsistent product names across massive datasets? Traditional exact matching just doesn't cut it when you're trying to integrate data from various sources (like a 1-million-row internal catalog with a 3.8-million-row external one like Open Food Facts). This talk addresses that exact problem: how to efficiently and accurately find fuzzy matches, saving you countless hours of manual reconciliation and enabling robust data enrichment. It's crucial for anyone working with real-world, imperfect data at scale.\r\n\r\n**Is this talk for me?**\r\nThis talk is for data engineers, data scientists, and analytics professionals who work with large-scale datasets and face challenges with data integration, record linkage, or building robust search functionalities. 
A basic understanding of dataframes and SQL will be helpful, but no deep prior knowledge of search algorithms is required.\r\n\r\nThis will be an informative and practical talk with a clear focus on real-world application. While we'll briefly cover the \"why\" behind BM25, the emphasis will be on \"how\" to implement and optimize it. We'll present concrete benchmarks and code examples, moving beyond theoretical concepts.\r\n\r\n**What will I learn?**\r\nBy the end of this session, you will:\r\n- Understand why BM25 is a superior choice for fuzzy matching noisy product names compared to traditional methods.\r\n- See a practical, head-to-head comparison of implementing BM25 using Python libraries (specifically the optimized Cython bm25s) and DuckDB's native full-text search.\r\n- Gain insights into performance implications (speed and memory usage) for each approach on large datasets, including the benefits of GPU acceleration with Dask CuDF.\r\n- Learn production tips for persisting indexes, handling bulk queries, and managing memory effectively.\r\n- Be equipped to choose the most suitable BM25 implementation for your specific data enrichment and fuzzy matching needs, allowing you to build faster and more accurate data pipelines.\r\n\r\n**Any pre-requisite knowledge I should have?**\r\n- A medium level background in python\r\n- An introductory level information about DuckDB\r\n- An introductory level information into how BM25 works would be bonus!", "recording_license": "", "do_not_record": false, "persons": [{"code": "DRM8YW", "name": "Aniket Abhay Kulkarni", "avatar": "https://cfp.pydata.org/media/avatars/DRM8YW_VQUeHmp.webp", "biography": "Aniket is an engineer at heart. He has founded Curlscape, where he helps businesses bring practical AI applications to life fast. He has led the design and deployment of large-scale systems across industries, from finance and healthcare to education and logistics. 
His work spans LLM-based information extraction, agentic workflows, voice assistants, and continuous evaluation frameworks.", "public_name": "Aniket Abhay Kulkarni", "guid": "320884d4-708b-5899-ba7a-9c39fb59fd8f", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/DRM8YW/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/8NYGXU/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/8NYGXU/", "attachments": []}, {"guid": "b1977105-5e0b-5f9f-84d0-95a11089199d", "code": "VHX7E7", "id": 78768, "logo": null, "date": "2025-12-09T13:30:00+00:00", "start": "13:30", "duration": "00:30", "room": "Analytics, Visualization & Decision Science", "slug": "pydataglobal2025-78768-lessons-learnt-in-optimizing-a-large-scale-pandas-application-using-polars-fireducks-and-cudf-go-smart-and-save-more", "url": "https://cfp.pydata.org/pydataglobal2025/talk/VHX7E7/", "title": "Lessons learnt in optimizing a large-scale pandas application using Polars, FireDucks and cuDF: Go Smart and Save More!", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Talk", "language": "en", "abstract": "In general, a Data Scientist spends significant efforts in transforming the raw data into a more digestible format before training an AI model or creating visualisations. Traditional tools such as pandas have long been the linchpin in this process, offering powerful capabilities but not without limitations. With numerous possible ways to write the same thing in pandas, often a user ends up selecting the uneconomical, inefficient ones, leading to large computational costs with the growth in data size. We introduce a couple of frequently occurring intricate performance issues in pandas, and what we have learnt in solving the same using popular high-performance pandas alternatives: Polars, FireDucks and cuDF. 
The talk intends to highlight one of the best practices (breaking out of the loops) that one should follow while dealing with large-scale data analysis, while demonstrating the key advantages of the high-performance pandas alternatives based on different scenarios.", "description": "It is a known fact that pandas might be slow when dealing with large-scale data analysis, but the know-how of writing an effective pandas application might save you a lot. For a data scientist who is primarily specialized in finding the key insights out of the data, it might be difficult to program from the perspective of runtime memory consumption, effective data flow optimization etc. High-performance pandas alternatives like Polars, FireDucks, cuDF etc. are designed to address these issues and can be very useful in saving a lot of operational cost (e.g., cloud cost, human cost etc.). We will talk about the key lessons we have learnt in optimizing a large-scale pandas application and the decision points in selecting the high-performance pandas alternatives. It can be very useful for the contemporary data professional who loves the flexible user APIs in pandas and wants to enhance the performance of their application without much effort when dealing with voluminous and complex data on a regular basis. \r\n\r\nThe key takeaways would be as follows:\r\n  1. How the choice and execution order of API calls in writing a data-related application impacts its performance.\r\n  2. How to stop thinking in the loop-based approach and design the algorithms using DataFrame APIs. \r\n  3. How the internal query optimizers in libraries like Polars, FireDucks etc. can be useful to bring SQL-like optimizations at python-level.\r\n  4. 
Whether to pay a large migration cost for optimizing an existing pandas-based application or to go smart with some minor modifications and save more operational cost.\r\n\r\nHere is the [presentation deck](https://github.com/qsourav/PyData-Global-2025/blob/main/docs/PyDataGlobal_20251209.pdf) used during the talk.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DC8SFX", "name": "Sourav Saha", "avatar": "https://cfp.pydata.org/media/avatars/DC8SFX_6Q12YbJ.webp", "biography": "Sourav has 12+ years of professional experience at NEC Corporation in the diverse fields of High-Performance Computing, Distributed Programming, Compiler Design, and Data Science. Currently, his team at NEC R&D Lab, Japan, is researching various data processing-related algorithms. Blending the mixture of different niche technologies related to compiler framework, high-performance computing, and multi-threaded programming, they have developed a Python library named FireDucks with highly compatible pandas APIs for DataFrame-related operations. In his previous engagements, he has worked in research and development of performance-critical AI and Big Data solutions, optimization of several legacy applications related to weather prediction, earth-quake simulation, etc., written in C++ and Fortran. 
He has been speaking at several meetups and technical conferences related to HPC and Data Science.", "public_name": "Sourav Saha", "guid": "05b03119-1998-5d35-b814-e8d545cd0dda", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/DC8SFX/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/VHX7E7/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/VHX7E7/", "attachments": []}, {"guid": "85399bfe-bf18-5edc-b197-771dd63e12c8", "code": "9CMRXJ", "id": 78738, "logo": null, "date": "2025-12-09T19:30:00+00:00", "start": "19:30", "duration": "00:30", "room": "Analytics, Visualization & Decision Science", "slug": "pydataglobal2025-78738-communicating-data-quality-making-the-invisible-visible-and-fun-with-pointblank", "url": "https://cfp.pydata.org/pydataglobal2025/talk/9CMRXJ/", "title": "Communicating Data Quality: Making the Invisible Visible (and Fun!) with Pointblank", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Talk", "language": "en", "abstract": "Ensuring and communicating data quality (DQ) is one of the most persistent challenges in data-driven organizations. Data scientists, engineers, and analysts often struggle not just with detecting DQ issues, but with presenting those issues in actionable ways for diverse stakeholders across an organization (e.g., pipeline owners, fellow developers, less-technical colleagues, etc). On top of this, DQ work has an image problem as it can be seen as tedious, opaque, or even adversarial.\r\n\r\nThis talk introduces Pointblank, a Python package designed to make data quality validation and communication both robust and approachable. The library provides a comprehensive set of tools for profiling, validating, and reporting on data quality. There\u2019s a strong focus on beautiful and actionable outputs as well. 
It can help you to generate tabular validation reports, data summaries, and granular error reporting that make it easy for anyone (technical or not) to understand what\u2019s wrong and why.\r\n\r\nAttendees will learn how Pointblank can help their teams not only catch data issues early, but also communicate them effectively, fostering a culture of shared responsibility for data quality. The talk will include live demos of common DQ workflows, showing how Pointblank turns a traditionally painful process into something transparent, productive, and even a little bit fun.", "description": "The overall goal of this talk is to get people excited about DQ and show how the Pointblank library makes DQ validation and communication easier, clearer, and more collaborative. I\u2019ll demonstrate some practical workflows that will hopefully inspire attendees to treat DQ as a shared (yet approachable) responsibility.\r\n\r\nHere\u2019s an outline for this talk:\r\n\r\n1. The Data Quality Communication Problem\r\n- why DQ is hard: technical, social, and organizational barriers\r\n- the \u201clast mile\u201d problem: not just finding issues, but making them clear and actionable\r\n- the validation plan, execution, and report lifecycle \r\n\r\n2. Introducing Pointblank\r\n- overview of the package and its philosophy: affordances for humans, not just machines\r\n- key features: validation, profiling, reporting, and workflow support\r\n\r\n3. Making Data Quality Actionable\r\n- live demo: Python API for data profiling, validation, and missing value reports\r\n- nice-looking outputs: tabular report, step-by-step summaries, and crystal-clear DQ messaging\r\n- how these outputs can help people get to the root of DQ problems faster\r\n\r\n4. Flexible Workflows\r\n- using LLMs to draft a validation plan\r\n- creating a validation plan from YAML\r\n- integrating with CI/CD and data pipelines\r\n\r\n5. 
Designing this Library for Collaboration and Fun\r\n- small design choices can make a big difference: easy-to-understand summaries, actionable extracts, and a user-friendly CLI\r\n- my personal goal: make DQ work less annoying and more rewarding\r\n\r\nI imagine the intended audience as being composed of data engineers, scientists, analysts, and anyone responsible for data quality. Also, this talk might interest team leads and managers looking to improve DQ culture in their organization. Insofar as skill level, this talk is suitable for Python users at any level.", "recording_license": "", "do_not_record": false, "persons": [{"code": "YXSERE", "name": "Richard Iannone", "avatar": "https://cfp.pydata.org/media/avatars/YXSERE_TFszZoO.webp", "biography": "Rich is a software engineer that enjoys working with Python. He likes to create Python libraries that help people to accomplish things. While Rich very clearly digs programming, he enjoys other things as well! Examples include: playing and listening to music, reading books, watching films, meeting up with friends, and wandering through the many valleys and ravines of the Greater Toronto Area.", "public_name": "Richard Iannone", "guid": "6803086d-fa1f-5a7a-88d3-7bf50bd84a06", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/YXSERE/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/9CMRXJ/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/9CMRXJ/", "attachments": []}], "Data Engineering & Infrastructure": [{"guid": "e0d8721c-7286-53a9-b30c-b891e2eb699f", "code": "HKWFL8", "id": 78783, "logo": null, "date": "2025-12-09T12:30:00+00:00", "start": "12:30", "duration": "01:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-78783-fast-cost-efficient-analytics-on-blockchain-data-using-duckdb-solana-as-a-case-study", "url": "https://cfp.pydata.org/pydataglobal2025/talk/HKWFL8/", "title": "Fast, Cost-Efficient Analytics on Blockchain data using 
DuckDB - Solana as a case study", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Tutorial", "language": "en", "abstract": "**Abstract:**\r\n\r\nBlockchain generates millions of transactions daily, making it a rich yet complex source of data for developers, analysts, and researchers. While Google BigQuery offers public access to Solana\u2019s historical data, repeated querying at scale can become costly and slow, especially during iterative exploration and analysis.\r\n\r\nIn this talk, I\u2019ll demonstrate a practical workflow that combines the power of BigQuery for data extraction with the speed and flexibility of DuckDB for local, in-memory analytics. We\u2019ll show how to efficiently query Solana data in BigQuery, export it to partitioned Parquet files, and use DuckDB to run fast, repeatable SQL queries without incurring additional cloud costs.\r\n\r\nYou'll learn:\r\n- Basic terms in blockchain data structure and how transactions are saved.\r\n- How to navigate and query Solana\u2019s public datasets on BigQuery.\r\n- How to export filtered blockchain data to efficient Parquet files.\r\n- How DuckDB can serve as a lightweight analytics engine for on-chain data.\r\n- Tips for partitioning, enriching, and automating your Solana data pipeline.\r\n\r\nThis demo will run entirely within Google Colab to save time and to let participants follow along during the session.\r\n\r\nWhether you're working on blockchain analytics, wallet behavior analysis, or on-chain data engineering, this talk will equip you with a practical approach to blockchain data workflows using open tools.", "description": "This talk explores how to build a workflow for Solana blockchain data using BigQuery and DuckDB. 
You'll learn how to query Solana\u2019s public datasets in BigQuery, export key data as Parquet files, and use DuckDB for high-speed local analytics. It is ideal for blockchain developers, data engineers, and analysts working with large on-chain datasets.", "recording_license": "", "do_not_record": false, "persons": [{"code": "JN9HKN", "name": "Busirah Olaitan Hammed", "avatar": "https://cfp.pydata.org/media/avatars/JN9HKN_8xswVF9.webp", "biography": "Busirah Hammed is a data engineer at YellowCard financial with over 6 years experience building data solutions. She's a data enthusiast whose experience spans across data science and engineering.", "public_name": "Busirah Olaitan Hammed", "guid": "8a8d479a-61d4-5bca-ae9e-9555cd291bfa", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/JN9HKN/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/HKWFL8/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/HKWFL8/", "attachments": []}, {"guid": "a6073b42-1733-5893-bfe8-db7c001696a4", "code": "CDEZQQ", "id": 79394, "logo": null, "date": "2025-12-09T15:00:00+00:00", "start": "15:00", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-79394-designing-a-fast-offline-capable-reverse-geocoder-in-python-an-open-source-alternative-to-big-geo-apis", "url": "https://cfp.pydata.org/pydataglobal2025/talk/CDEZQQ/", "title": "Designing a Fast, Offline-Capable Reverse Geocoder in Python: An Open Source Alternative to Big Geo APIs", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "While commercial reverse geocoding APIs, such as Google Maps or Mapbox, are effective, they are also costly, have rate limitations, and are not appropriate for offline or privacy-sensitive settings.\r\n\r\nUsing available datasets and Python modules like `cKDTree`, `shapely`, and `geopandas`, we will demonstrate how to create a quick, scalable, offline-capable reverse geocoding 
system in Python in this session.\r\n\r\nYou will learn how to:\r\n- Convert geographic shapefiles into effective spatial indices  \r\n- Perform location lookups in milliseconds using tree search and vector mathematics  \r\n- Handle edge cases like unclear borders, cities with identical names, and GPS noise  \r\n- Improve performance and memory usage through multiprocessing\r\n\r\nThe system is fully open source and has been production-tested in a high-throughput environment. Whether you are developing applications for edge inference, mapping, or logistics, this talk will help you take control of your geospatial infrastructure without depending on costly commercial APIs.", "description": "Reverse geocoding \u2014 converting coordinates into readable place names \u2014 is a core building block of applications in logistics, mapping, mobility, and location intelligence. Yet developers are often locked into commercial APIs that are expensive, rate-limited, and unsuitable for offline or privacy-first use cases.\r\n\r\nIn this talk, we\u2019ll walk through the architecture and implementation of a fast reverse geocoding engine built entirely in Python using open-source tooling. You\u2019ll see how spatial data (such as OpenStreetMap shapefiles) can be indexed efficiently using `scipy`'s `cKDTree`, queried with millisecond latency, and integrated into real-world systems.\r\n\r\nWe\u2019ll explore performance trade-offs, data preprocessing techniques, and methods for dealing with ambiguous or noisy GPS data. 
The session includes benchmarks and a live walkthrough of the code powering the reverse geocoder \u2014 which is lightweight enough to run on a laptop or edge device.\r\n\r\nAttendees will leave with a clear understanding of how to build and adapt this system for their own needs \u2014 and gain insight into how geospatial systems work behind the scenes.", "recording_license": "", "do_not_record": false, "persons": [{"code": "LS8QLS", "name": "Sooraj Sivadasan", "avatar": null, "biography": null, "public_name": "Sooraj Sivadasan", "guid": "a43e31e3-89ba-5bea-bdb2-612628335295", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/LS8QLS/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/CDEZQQ/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/CDEZQQ/", "attachments": []}, {"guid": "fd2bec27-8989-505d-bc3a-57d4bcd8c71a", "code": "3BLRCH", "id": 78844, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/3BLRCH/2025-07-14_19-42-30_n0fcoVY.png", "date": "2025-12-09T15:30:00+00:00", "start": "15:30", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-78844-enhancing-apache-nifi-2-x-with-python-processors", "url": "https://cfp.pydata.org/pydataglobal2025/talk/3BLRCH/", "title": "Enhancing Apache NiFi 2.x with Python Processors", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "In this talk, I will delve into the world of Apache NiFi 2.0 Python processors, exploring the capabilities they offer and demonstrating how to build custom processors to enhance your data processing pipelines.\r\n\r\nBy the end of this talk, participants will have a comprehensive understanding of building and optimizing Apache NiFi 2.0 Python processors, enabling them to integrate Python seamlessly into their data processing workflows.\r\n\r\nThis session is suitable for data engineers, architects, and anyone interested in 
harnessing the combined power of Apache NiFi and Python for efficient data integration and flow management. One of the main uses is to build prompts and call open LLM and AI. NiFi excels at integration, I will cover some interesting sources, sinks and enrichments and show when Python is helpful.", "description": "In this talk, I will delve into the world of Apache NiFi 2.0 Python processors, exploring the capabilities they offer and demonstrating how to build custom processors to enhance your data processing pipelines.\r\n\r\nBy the end of this talk, participants will have a comprehensive understanding of building and optimizing Apache NiFi 2.0 Python processors, enabling them to integrate Python seamlessly into their data processing workflows.\r\n\r\nThis session is suitable for data engineers, architects, and anyone interested in harnessing the combined power of Apache NiFi and Python for efficient data integration and flow management. One of the main uses is to build prompts and call open LLM and AI. NiFi excels at integration, I will cover some interesting sources, sinks and enrichments and show when Python is helpful.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ABY8GL", "name": "Timothy Spann", "avatar": "https://cfp.pydata.org/media/avatars/ABY8GL_4Jg56aW.webp", "biography": "https://github.com/tspannhw/SpeakerProfile\r\n\r\nTim Spann is a Senior Solutions Engineer @ Snowflake. He works with Generative AI, LLM, Snowflake, SQL, HuggingFace, Python, Java, Apache NiFi, Apache Kafka, Apache Pulsar, Apache Flink, Flink SQL, Apache Spark, Big Data, IoT, Cloud, AI/DL, Machine Learning, and Deep Learning. Tim has over ten years of experience with the IoT, big data, distributed computing, messaging, streaming technologies, and Java programming. 
Previously, he was a Principal Developer Advocate at Zilliz, Principal Developer Advocate at Cloudera, Developer Advocate at StreamNative, Principal DataFlow Field Engineer at Cloudera, a Senior Solutions Engineer at Hortonworks, a Senior Solutions Architect at AirisData, a Senior Field Engineer at Pivotal and a Senior Team Leader at HPE. He blogs for DZone, where he is the Big Data Zone leader, and runs a popular meetup in Princeton & NYC on Big Data, Cloud, IoT, deep learning, streaming, NiFi, the blockchain, and Spark. Tim is a frequent speaker at conferences such as ApacheCon, DeveloperWeek, Pulsar Summit and many more. He holds a BS and MS in Computer Science.", "public_name": "Timothy Spann", "guid": "991b4ee9-3197-5153-bc8f-3487b3683e82", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/ABY8GL/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/3BLRCH/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/3BLRCH/", "attachments": []}, {"guid": "0b310fe7-af1f-56b5-819e-33488099353e", "code": "AAGRYV", "id": 79466, "logo": null, "date": "2025-12-09T16:30:00+00:00", "start": "16:30", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-79466-combining-zarr-hdf5-and-tiff-into-a-single-data-format", "url": "https://cfp.pydata.org/pydataglobal2025/talk/AAGRYV/", "title": "Combining Zarr, HDF5, and TIFF into a single data format", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "TIFF, HDF5, and Zarr represent a few choices to store large n-dimensional arrays which represent scientific and machine learning data. Trade-offs have to be considered when selecting one of these formats. While TIFF files are recognized by many applications particularly for imaging, they are limited in the number of dimensions, two, traditionally, or three in the case of GeoTIFF. 
HDF5 was created to support hierarchical scientific data with arrays up to 32 dimensions, but is mainly readable by scientific applications. Neither TIFF nor HDF5 was designed with the cloud in mind. Meanwhile, Zarr reimagined HDF5 in the era of cloud computing and key-value object stores. In retrospect, these disparate formats have many similarities. I will demonstrate how to take advantage of these similarities to combine the formats and make data accessible to a wide range of local and cloud-based applications without duplicating the data itself.", "description": "Choosing a standard format for high dimensional (N >= 2) array data is challenging in that one must consider trade-offs between compatible software packages, cloud optimization, and complexity, yet the need for such data has increased with recent advances in machine learning and volumetric imaging in the earth and biological sciences. The 927th installment of the XKCD comic series illustrates how standards proliferate: the existence of many prior and imperfect standards portends the creation of yet another standard to supplant the ones that came before often without considering similarities or compatibility with prior standard formats. For n-dimensional data, TIFF, HDF5, and Zarr are now common formats in use across various fields and scientific domains. While TIFF and HDF5 were designed decades ago with flexible metadata structures, cloud optimization of these formats has helped to consolidate metadata in these formats and narrow the differences with the cloud-native file format Zarr. While Zarr has traditionally used individual keys for each compressed chunk, version 3 of the format introduces a sharding codec allowing multiple chunks to exist in the same file under a single key. The consolidation of chunks is reminiscent of tiles in TIFF files or chunked datasets in HDF5. 
Essentially each of these file formats have the capability to describe the location and sizes of individual blocks of data contained within. By taking advantage of metadata consolidation to achieve modularity, we can tailor and combine these formats to point to the same data blocks, avoiding duplication. The result is a hybrid file format that is simultaneously a TIFF, HDF5, and Zarr v3 shard. Readers of any of these formats can be used to read the same data blocks contained within this format.\r\n\r\nTo illustrate the concept of a combined Zarr, HDF5, and TIFF format, I have created an example Jupyter notebook demonstrating a small Python library that can write data in this hybrid format. I then show how data can be read using libtiff, h5py, or tensorstore, manipulated by h5py, and then have the changes read using the same libraries.\r\nhttps://github.com/mkitti/simple_image_formats/blob/main/header_formats.ipynb", "recording_license": "", "do_not_record": false, "persons": [{"code": "USMVVE", "name": "Mark Kittisopikul, Ph.D.", "avatar": "https://cfp.pydata.org/media/avatars/USMVVE_LEXahCX.webp", "biography": "I am a Software Engineer III at the Janelia Research Campus of the Howard Hughes Medical Institute. 
I specialize in working with data from light microscopy drawing upon my experience as a postdoctoral cell biologist.", "public_name": "Mark Kittisopikul, Ph.D.", "guid": "a35affbc-7050-5773-aec2-8e67ac099bd9", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/USMVVE/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/AAGRYV/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/AAGRYV/", "attachments": []}, {"guid": "7c053cfa-0a4f-50c8-8b0d-6d1376050c40", "code": "9HUY9G", "id": 78723, "logo": null, "date": "2025-12-09T19:30:00+00:00", "start": "19:30", "duration": "01:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-78723-gpu-python-for-the-real-world-practical-steps-to-gpu-accelerated-python-with-rapids", "url": "https://cfp.pydata.org/pydataglobal2025/talk/9HUY9G/", "title": "GPU Python for the Real World: Practical Steps to GPU-Accelerated Python with RAPIDS", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Tutorial", "language": "en", "abstract": "NVIDIA GPUs offer unmatched speed and efficiency for data processing and model training, significantly reducing the time and cost associated with these tasks. Using GPUs is even more tempting when you use zero-code-change plugins and libraries. You can use PyData libraries including pandas, polars and networkx without needing to rewrite your code to get the benefits of GPU acceleration. We can also mix in GPU native libraries like Numba, CuPy and pytorch to accelerate our workflows from end-to-end.\r\n\r\nHowever, integrating GPUs into our workflow can be a new challenge where we need to learn about installation, dependency management, and deployment in the Python ecosystem. When writing code, we also need to monitor performance, leverage hardware effectively, and debug when things go wrong\r\n\r\nThis is where RAPIDS and its tooling ecosystem comes to the rescue. 
RAPIDS is a collection of open source software libraries to execute end-to-end data pipelines on NVIDIA GPUs using familiar PyData APIs.", "description": "In this tutorial we will cover:\r\n- Introduction to cuDF, cuML and more that showcases a simple example of data processing and model training on GPUs.\r\n- Answers to questions like: \u201cWhere do I get a GPU?\u201d, \u201cHow do I run a container on a VM with a GPU?\u201d, \u201cHow do I install GPU packages into an existing environment?\u201d, as well as follow along examples to get a GPU up and running.\r\n- Troubleshooting and monitoring:  Examples of performance analysis, diagnostics, and debugging.\r\n\r\nThis is a hands-on tutorial, with multiple examples to get familiarized with the RAPIDS ecosystem. Participants should ideally have some experience using Python, pandas and scikit-learn. We'll use cloud-based VMs, so familiarity with the cloud and resource creation is helpful but not required. No prior GPU knowledge is needed.", "recording_license": "", "do_not_record": false, "persons": [{"code": "EE7H7J", "name": "Jacob Tomlinson", "avatar": "https://cfp.pydata.org/media/avatars/EE7H7J_KtXk3nN.webp", "biography": "Jacob Tomlinson is a senior software engineer at NVIDIA. His work involves maintaining open source projects including RAPIDS and Dask. He also tinkers with kr8s in his spare time. He lives in Exeter, UK.", "public_name": "Jacob Tomlinson", "guid": "effa7a13-dc1e-59d2-ad5d-09840937dc0c", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/EE7H7J/"}, {"code": "QUDSNA", "name": "Naty Clementi", "avatar": "https://cfp.pydata.org/media/avatars/QUDSNA_zs2wahg.webp", "biography": "Naty Clementi is a senior software engineer at [NVIDIA](https://www.nvidia.com/). She is a former academic with a Masters in Physics and PhD in Mechanical and Aerospace Engineering to her name. 
Her work involves contributing to [RAPIDS](https://rapids.ai/), and in the past she has also contributed and maintained other open source projects such as [Ibis](https://ibis-project.org/) and [Dask](https://www.dask.org/). She is an active member of [PyLadies](https://pyladies.com/) and an active volunteer and organizer of [Women and Gender Expansive Coders DC meetups](https://www.meetup.com/women-and-gender-expansive-coders-dc-wgxc-dc/).", "public_name": "Naty Clementi", "guid": "8f3a5689-2347-5f6b-bd61-42adb4d75334", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/QUDSNA/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/9HUY9G/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/9HUY9G/", "attachments": []}], "Live from PyData Boston": [{"guid": "3d339794-9d12-5914-b9df-680cda87fb96", "code": "8CFCDH", "id": 84930, "logo": null, "date": "2025-12-09T15:00:00+00:00", "start": "15:00", "duration": "00:40", "room": "Live from PyData Boston", "slug": "pydataglobal2025-84930-the-lifecycle-of-a-jupyter-environment-from-exploration-to-production-grade-pipelines", "url": "https://cfp.pydata.org/pydataglobal2025/talk/8CFCDH/", "title": "The Lifecycle of a Jupyter Environment: From Exploration to Production-Grade Pipelines", "subtitle": "", "track": "Live from PyData Boston", "type": "Talk", "language": "en", "abstract": "Most data science projects start with a simple notebook\u2014a spark of curiosity, some exploration, and a handful of promising results. But what happens when that experiment needs to grow up and go into production?\r\n\r\nThis talk follows the story of a single machine learning exploration that matures into a full-fledged ETL pipeline. 
We\u2019ll walk through the practical steps and real-world challenges that come up when moving from a Jupyter notebook to something robust enough for daily use.\r\n\r\nWe\u2019ll cover how to:\r\n\r\n- Set clear objectives and document the process from the beginning\r\n- Break messy notebook logic into modular, reusable components\r\n- Choose the right tools (Papermill, nbconvert, shell scripts) based on your workflow\u2014not just the hype\r\n- Track environments and dependencies to make sure your project runs tomorrow the way it did today\r\n- Handle data integrity, schema changes, and even evolving labels as your datasets shift over time\r\n\r\nAnd as a bonus: bring your results to life with interactive visualizations using tools like PyScript, Voila, and Panel + HoloViz", "description": "- (3 mins) Intro \r\n    - I've been supporting various groups in their developer experience since 2020 after being a freelance Python consultant. I've worked on many many dozens of projects, unblocking users picking the right tools for the task at hand. \r\n    - It works on my machine \r\n    - What we're building today: ML pipeline with RAPIDS -> Snowflake\r\n    - We're going to watch a real project grow up\r\n- (3 mins) Exploration - starting as a single messy notebook, sample data set. \r\n    - Why RAPIDS? 
GPU\r\n        - Large data sets\r\n        - GPU availability - remote machine, local GPU\r\n        - workflows that work well with GPU \r\n    - Load Data cuDF / pandas\r\n    - Quick EDA and data visualization\r\n    - Train cuML / scikit-learn model \r\n    - no-code change philosophy\r\n- (7 mins) Make it repeatable - Start with simple tried and true tools, explore where tools like Papermill help with flexibility and reproducibility\r\n    - common painpoints: operating cadence, specialized scenarios, manual execution is error prone\r\n    - shell scripts versus papermill \r\n    - reproducible environments\r\n    - generate HTML reports\r\n    - pass through parameters in your notebook\r\n- (8 mins) Make it reliable - Modular code & testing\r\n    - common painpoints: data schema changes, debugging issues, testing & modularity\r\n    - nbconvert + Python: turn your notebook into a script\r\n    - turn a function into a module\r\n    - dashboard with HoloViz / Panel, discuss choosing tools like Voila and PyScript\r\n- (5 mins) Snowflake integration\r\n    - common painpoints: data volume, coordinate with other data systems, audits\r\n    - picking the right tools: cost complexity tradeoff\r\n    - RAPIDS preprocessing to Snowflake storage\r\n    - self-service access for stakeholders\r\n- (3 mins) Conclusion \r\n    - Start simple\r\n    - Add complexity when you feel specific pain", "recording_license": "", "do_not_record": false, "persons": [{"code": "LDQPN9", "name": "Dawn Wages", "avatar": "https://cfp.pydata.org/media/avatars/LDQPN9_BEpPpGr.webp", "biography": null, "public_name": "Dawn Wages", "guid": "a05c5500-4058-5c2e-90bb-c88f9129226a", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/LDQPN9/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/8CFCDH/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/8CFCDH/", "attachments": []}, {"guid": "5d355aac-155d-52bf-b48e-0a7950f1b0e8", "code": "UHN9UX", 
"id": 84933, "logo": null, "date": "2025-12-09T16:15:00+00:00", "start": "16:15", "duration": "00:40", "room": "Live from PyData Boston", "slug": "pydataglobal2025-84933-using-traditional-ai-and-llms-to-automate-complex-and-critical-documents-in-healthcare", "url": "https://cfp.pydata.org/pydataglobal2025/talk/UHN9UX/", "title": "Using Traditional AI and LLMs to Automate Complex and Critical Documents in Healthcare", "subtitle": "", "track": "Live from PyData Boston", "type": "Talk", "language": "en", "abstract": "Informed Consent Forms (ICFs) are critical documents in clinical trials. They are the first, and often most crucial, touchpoint between a patient and a clinical trial study. Yet the process of developing them is laborious, high-stakes, and heavily regulated. Each form must be tailored to jurisdictional requirements and local ethics boards, reviewed by cross-functional teams, and written in plain language that patients can understand. Producing them at scale across countries and disease areas demands manual effort and creates major operational bottlenecks. We used a combination of traditional AI and large language models to autodraft the ICF across clinical trial types, across countries and across disease areas at scale. The build, test, iteration and deployment offers both technical and non technical lessons learned for generative AI applications for complex documents at scale and for meaningful impact.", "description": "Informed Consent Forms are highly complex documents that require high precision and quality. A phase 2 / 3 clinical trial can have almost 1000 different forms that take considerable time to complete. We identified this challenge that directly impacts trial timelines and patient engagement. The automated AI solution: the \u201cICF Autodrafter\u201d, a custom LLM-powered application that automates the drafting of ICFs. 
This tool ingests a clinical trial protocol and ICF template and outputs a complete draft in minutes, cutting document preparation time by 90%. \r\n\r\nThis solution is not generic automation. The backend logic parses highly structured protocol documents, segments them, and feeds the relevant content into a carefully fine-tuned LLM that maps text to specific ICF fields. The front-end is designed for usability by clinical trial managers, with human-in-the-loop reviews. This system has already supported ICF creation for more than ten trials and has achieved near-perfect consistency (97%) with human-generated content, underscoring the speed, quality, and robustness of the solution. \r\n\r\nWe rigorously tested each version with A/B comparisons, iterated with feedback from end-users, and anchored all development within regulatory and ethical guardrails. The impact extends beyond efficiency. By standardizing and accelerating ICF production, we can reduce delays in trial start-up and potentially get medicines to patients faster, without compromising safety, compliance, or clarity. 
Furthermore, it also lays down a scalable model for future AI-driven document workflows across other parts of life sciences and healthcare.", "recording_license": "", "do_not_record": false, "persons": [{"code": "J7MWBW", "name": "Aman Bhandari", "avatar": "https://cfp.pydata.org/media/avatars/J7MWBW_kOKAFrB.webp", "biography": null, "public_name": "Aman Bhandari", "guid": "ad13cfd4-e59a-5771-9e58-426384d35f89", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/J7MWBW/"}, {"code": "BCCTCH", "name": "Lily Xu", "avatar": "https://cfp.pydata.org/media/avatars/BCCTCH_p9TVnql.webp", "biography": null, "public_name": "Lily Xu", "guid": "7ec9adc6-bf6a-58b7-a968-368a1e387dff", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/BCCTCH/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/UHN9UX/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/UHN9UX/", "attachments": []}, {"guid": "39b89708-3446-5f6f-b17c-6bb64fb14339", "code": "V7GSU7", "id": 84934, "logo": null, "date": "2025-12-09T17:00:00+00:00", "start": "17:00", "duration": "00:40", "room": "Live from PyData Boston", "slug": "pydataglobal2025-84934-where-have-all-the-metrics-gone", "url": "https://cfp.pydata.org/pydataglobal2025/talk/V7GSU7/", "title": "Where Have All the Metrics Gone?", "subtitle": "", "track": "Live from PyData Boston", "type": "Talk", "language": "en", "abstract": "How exactly does one validate the factuality of answers from a Retrieval-Augmented Generation (RAG) system? Or measure the impact of the new system prompt for your customer service agent? What do you do when stakeholders keep asking for \"accuracy\" metrics that you simply don't have? 
In this talk, we\u2019ll learn how to define (and measure) what \u201cgood\u201d looks like when traditional model metrics don\u2019t apply.", "description": "In the good old supervised learning days, standard measures like accuracy, F1, and MSE were like blazes on the data science trail, showing us how to descend the gradient towards \"better\". But now we're in uncharted analytics territory, where our work increasingly involves unlabeled data and generative AI outputs, and metrics are either unavailable or undefined.\r\n\r\nThe key to every successful trek is preparation. We have to move from thinking about \u201cmetrics as defaults\u201d to \u201cmetrics as design choices.\" We also need to be ready to design those metrics before we even start testing, because when we devise metrics post-training, we risk HARKing (Hypothesizing After Results are Known) and losing our scientific footing. \r\n\r\nThis talk will provide a field guide for translating different kinds of modern research questions into clearly-defined metrics, including:\r\n* Metrics of the past and why they aren't as useful now (~5 min)\r\n* Common failure modes when attempting to evaluate generative AI outputs and other unlabeled data (~8 min)\r\n* Techniques for identifying proxies when labels are missing (~8 min)\r\n* Defining criteria for open-ended outputs (~8 min)\r\n* Open source Python libraries (including new tools like [outlines](https://github.com/dottxt-ai/outlines) and [dspy](https://github.com/stanfordnlp/dspy) as well as old favorites like [hypothesis](https://hypothesis.readthedocs.io/en/latest/) and [pytest](https://docs.pytest.org/en/stable/)) to equip you for your next data science adventure (~8 min)\r\n\r\nCome learn how to define and adapt new metrics\u00a0so that you'll be prepared for wherever your modeling journey takes you.", "recording_license": "", "do_not_record": false, "persons": [{"code": "CNDFND", "name": "Dr. 
Rebecca Bilbro", "avatar": "https://cfp.pydata.org/media/avatars/CNDFND_qXA7i4B.webp", "biography": null, "public_name": "Dr. Rebecca Bilbro", "guid": "047c1526-e7c3-5d44-a773-0b3c56d04d38", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/CNDFND/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/V7GSU7/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/V7GSU7/", "attachments": []}, {"guid": "2f085ad1-ff0d-5437-9668-60e24752d1ad", "code": "YHTMZY", "id": 84936, "logo": null, "date": "2025-12-09T19:30:00+00:00", "start": "19:30", "duration": "00:40", "room": "Live from PyData Boston", "slug": "pydataglobal2025-84936-the-sat-math-gap-gender-difference-or-selection-bias", "url": "https://cfp.pydata.org/pydataglobal2025/talk/YHTMZY/", "title": "The SAT math gap: gender difference or selection bias?", "subtitle": "", "track": "Live from PyData Boston", "type": "Talk", "language": "en", "abstract": "Why do male test takers consistently score about 30 points higher than female test takers on the mathematics section of the SAT? Does this reflect an actual difference in math ability, or is it an artifact of selection bias\u2014if young men with low math ability are less likely to take the test than young women with the same ability?\r\n\r\nThis talk presents a Bayesian model that estimates how much of the observed difference can be explained by selection effects. We\u2019ll walk through a complete Bayesian workflow, including prior elicitation with PreliZ, model building in PyMC, and validation with ArviZ, showing how Bayesian methods disentangle latent traits from observed outcomes and separate the signal from the noise.\r\n\r\nNo prior knowledge of Bayesian statistics is required; attendees should be familiar with Python and common probability distributions.", "description": "Overview\r\n\r\nThis talk uses the SAT math gap as a case study to demonstrate modern Bayesian modeling in practice. 
For decades, male test takers have outperformed female test takers on the SAT math section by about 30 points. This outcome could reflect an actual difference in ability, or it could be explained by selection bias, if boys with weaker math skills are less likely to take the SAT than girls with comparable skills.\r\nI present a generative Bayesian model that explicitly incorporates this selection mechanism and estimates the fraction of the observed gap attributable to bias. The talk emphasizes workflow over theory: how to build, validate, and interpret Bayesian models using PyMC, ArviZ, and PreliZ.\r\n\r\nAudience\r\n\r\nThe target audience includes data scientists, applied researchers, and engineers who:\r\n* Use Python for data analysis,\r\n* Have basic familiarity with probability distributions,\r\n* Are curious about Bayesian modeling but do not necessarily have prior experience with PyMC or Bayesian statistics.\r\n\r\nLearning goals\r\n\r\nAttendees will learn:\r\n\r\n* How to frame a substantive question as a Bayesian generative model,\r\n\r\n* How to use PreliZ for prior elicitation, PyMC for model building, and ArviZ for diagnostics and posterior predictive checks,\r\n\r\n* How to interpret results in terms of latent traits vs. observed outcomes,\r\n\r\n* How Bayesian models can provide a principled way to reason about confounding and bias.\r\n\r\n\r\nOutline (approx. 
30\u201340 minutes)\r\n\r\nIntroduction & background (5 min)\r\n \u2013 The SAT math gap and the debate over its causes\r\n \u2013 Why Bayesian inference is a good fit for this problem\r\n\r\nModel construction (10 min)\r\n \u2013 Latent efficacy distribution\r\n \u2013 Selection mechanism (logistic link)\r\n \u2013 Noise modeling for score perturbations\r\n\r\nWorkflow demonstration (15 min)\r\n \u2013 Prior elicitation with PreliZ\r\n \u2013 Sampling and diagnostics with PyMC and ArviZ\r\n \u2013 Posterior predictive checks\r\n\r\nResults & interpretation (5\u20137 min)\r\n \u2013 Estimated contribution of selection bias to the observed gap\r\n \u2013 Broader implications for educational testing and applied modeling\r\n\r\nTakeaways (3\u20135 min)\r\n \u2013 Lessons about Bayesian workflow\r\n \u2013 Relevance to real-world problems of bias and confounding\r\n\r\n\r\nMaterials\r\n\r\nAll code and data preprocessing will be available in a public GitHub repository so attendees can reproduce the analysis and adapt it to their own work.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ECV9N3", "name": "Allen  Downey", "avatar": "https://cfp.pydata.org/media/avatars/ECV9N3_O7lEiTp.webp", "biography": null, "public_name": "Allen  Downey", "guid": "fd6888b8-1d9f-5317-accc-05fab45b7326", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/ECV9N3/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/YHTMZY/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/YHTMZY/", "attachments": []}, {"guid": "23e137bd-c764-5ade-8e38-2ba38ecb155e", "code": "TZSWMW", "id": 84937, "logo": null, "date": "2025-12-09T20:45:00+00:00", "start": "20:45", "duration": "00:40", "room": "Live from PyData Boston", "slug": "pydataglobal2025-84937-the-boringly-simple-loop-powering-genai-apps", "url": "https://cfp.pydata.org/pydataglobal2025/talk/TZSWMW/", "title": "The Boringly Simple Loop Powering GenAI Apps", "subtitle": "", 
"track": "Live from PyData Boston", "type": "Talk", "language": "en", "abstract": "Do you feel lost in the jungle of GenAI frameworks and buzzwords? Here's a way out. Take any GenAI app, peel away the fluff, and look at its core. You'll find the same pattern: a boringly simple nested while loop. I will show you how this loop produces chat assistants, AI agents, and multi-agent systems. Then we'll cover how RAG, tool-calling, and memory are like lego bricks we add as needed. This gives you a first-principles based map. Use it to build GenAI apps from scratch; no frameworks needed.", "description": "### Central Thesis\r\nWe are at a point where talking about GenAI apps has become more complex than building them. Social media is obsessed with the \"top 10 libraries for GenAI\", search engines are swamped with shallow tutorials, and many devs I meet are rightfully confused what frameworks they should spend time on.\r\n\r\nThe answer is \"none, GenAI isn't all that complicated\". However, that answer isn't sexy because it doesn't grab attention, doesn't sell consulting hours, and doesn't convince someone to buy an online course. Hence few people give it. That has to change!\r\n\r\nThat's what this talk is about: The boringly simple basics of building GenAI apps and how you can use a simple nested while loop to build assistants, AI agents, or multi-agent systems. Sometimes less is more.\r\n\r\n### Takeaways\r\n- Create prototypes of agentic apps from scratch using fundamental building blocks\r\n- Choose the right components (like RAG or MCP) for your specific problem\r\n- Debug agentic apps by spotting misconfigured context\r\n\r\n### Target Audience\r\nThis talk is for the software engineer and data professional that wants to get hands-on with GenAI. Medium and Substack taught you concepts like RAG and AI Agents, social media hyped you up, and now it\u2019s time to build. The only problem: Where do you start? 
How do you turn \"let's build something that does XYZ\" into a concrete software product? If you feel like you are sitting with a pile of Lego pieces while everyone else is playing with a completed spaceship, this talk is for you. It's for builders who are ready to go from reading to coding.\r\n\r\n### Prerequisites\r\nYou should have working knowledge of Python and familiarity with LLM terminology (tokens, context window, system prompt, ...). If you're comfortable reading source code, you have everything you need. No prior experience with frameworks like LangChain, LlamaIndex, or others is necessary.\r\n\r\n### Outline\r\n**Introduction** (2 min)\r\n\r\n**The core loop** (15 min)\r\n- Introduction to the fundamental pattern that orchestrates GenAI apps (the \"core loop\")\r\n- Definition of the terms \"Turns\" and \"Traces\" that are foundational to building and optimizing flows\r\n- Showcase on how to create assistants, workflows, AI agents, and multi-agent systems using this pattern\r\n\r\n**Context Engineering** (15min)\r\n- Introduction to the three parts of context engineering: Plans, Knowledge, and Tools.\r\n- Discussion on how these parts relate to the core loop and where to define them\r\n- Showcase how RAG, MCP, memory, etc. assist in setting up the system context\r\n\r\n**Q&A** (5min)\r\n**Buffer** (3min)\r\n\r\n### Bio\r\nI'm an engineer and open-source maintainer with a PhD in Computer Science and over a decade of hands-on experience building with AI/ML. Having scaled ImageIO, a foundational Python library, from 2 to 35 million monthly downloads, I know what it takes to build robust, scalable software. I co-founded PyData Stockholm and am deeply integrated into our data community. 
My current focus is to bring first principles thinking to the GenAI landscape and help developers build more robust systems.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DNLRAG", "name": "Sebastian Wallk\u00f6tter", "avatar": "https://cfp.pydata.org/media/avatars/DNLRAG_bQg0QJQ.webp", "biography": null, "public_name": "Sebastian Wallk\u00f6tter", "guid": "7df90aee-5c21-5a35-b7cb-2a406a0aa9cc", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/DNLRAG/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/TZSWMW/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/TZSWMW/", "attachments": []}]}}, {"index": 2, "date": "2025-12-10", "day_start": "2025-12-10T04:00:00+00:00", "day_end": "2025-12-11T03:59:00+00:00", "rooms": {"General Track": [{"guid": "dc1a59c5-5361-5a5d-8515-0a2b890ce0cb", "code": "B3QRQA", "id": 79476, "logo": null, "date": "2025-12-10T12:00:00+00:00", "start": "12:00", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-79476-pydata-sparse-finch-extending-sparse-computing-in-the-python-ecosystem", "url": "https://cfp.pydata.org/pydataglobal2025/talk/B3QRQA/", "title": "PyData/Sparse & Finch: extending sparse computing in the Python ecosystem", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "Scientific Python Ecosystem offers a wide variety of numerical packages, such as NumPy, CuPy, or JAX. One of the domains that also captures a lot of attention in the community is sparse computing.\r\n\r\nIn this talk, we will present the current landscape of sparse computing in the Python ecosystem and our efforts to revive/expand it. Our main contributions to the Python ecosystem cover: (1) making a novel Finch sparse tensor compiler and Galley scheduler available for the community, (2) standardizing various aspects of sparse computing. 
We will show how to use the Finch compiler with the PyData/Sparse package and how it outperforms well-established alternatives for multiple kernels, such as MTTKRP or SDDMM.\r\n\r\nReal-world use-cases will show you how, step-by-step, Python practitioners can migrate their code to an Array API compatible version and benefit from tensor operator fusion and autoscheduling capabilities offered by the Finch compiler.\r\n\r\nApart from the existing Julia implementation, the number of sparse backends offered by PyData/Sparse will grow in the future to provide Python-native alternatives for scipy.sparse and Numba solutions. One of them that is currently under development is finch-tensor-lite, a pure Python rewrite of Finch.jl compiler, meant to make the solution lightweight by dropping Julia runtime dependency while providing the majority of features.", "description": "In this talk we're going to understand the current landscape of sparse computing in the Python ecosystem first. Then a high-level overview of the Finch technology and compiler's architecture will be presented together with other solutions vital for the project: Array API Standard and binsparse format.\r\n\r\nNext, we're going to present a selected set of benchmarks - also focusing on real world use-cases: how Finch impacts users' experience when writing sparse programs in Python. Last but not least a showcase of the current development will be shown - pure Python rewrite of Finch compiler.", "recording_license": "", "do_not_record": false, "persons": [{"code": "TABXBG", "name": "Mateusz Sok\u00f3\u0142", "avatar": "https://cfp.pydata.org/media/avatars/TABXBG_mK5y5fC.webp", "biography": "I'm a Software Engineer at Quansight, working on a multitude of open source projects in the Scientific Python Ecosystem. 
You can find my GitHub profile here: https://github.com/mtsokol", "public_name": "Mateusz Sok\u00f3\u0142", "guid": "41d4c37d-4287-5427-881e-70661cefbd9a", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/TABXBG/"}, {"code": "99ELXS", "name": "Willow Marie Ahrens", "avatar": "https://cfp.pydata.org/media/avatars/99ELXS_VUq0vD3.webp", "biography": "Willow Ahrens\r\nWillow Ahrens\r\n\r\nAssistant Professor in the School of Computer Science at Georgia Tech.\r\n\r\n    Office 3144, Klaus Advanced\r\n    Computing Building, Georgia Tech\r\n    Email\r\n    GitHub\r\n    ORCID\r\n    Google Scholar\r\n\r\nAbout\r\n\r\nI am an assistant professor in the School of Computer Science at Georgia Tech. I am inspired to make programming high-performance computers more productive, efficient, and accessible. My research focuses on using compilers to accelerate productive programming languages with state-of-the-art datastructures, algorithms, and architectures, bridging the gap between program flexibility and performance. I\u2019m the author of the [Finch](https://github.com/finch-tensor/Finch.jl) sparse tensor programming language. Finch supports general programs on general tensor formats, such as sparse, run-length-encoded, banded, or otherwise structured tensors. 
Please reach out if you are interested in doing research at Georgia Tech!", "public_name": "Willow Marie Ahrens", "guid": "ca3d12d6-1568-5633-b798-1b554012ce5c", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/99ELXS/"}], "links": [{"title": "Finch Tensor website", "url": "https://finch-tensor.org/", "type": "related"}], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/B3QRQA/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/B3QRQA/", "attachments": []}, {"guid": "cc5bdd29-7d7b-5a6d-806e-e99580d9d22e", "code": "NMYJM8", "id": 78722, "logo": null, "date": "2025-12-10T13:00:00+00:00", "start": "13:00", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-78722-effver-versioning-code-by-the-effort-required-to-upgrade", "url": "https://cfp.pydata.org/pydataglobal2025/talk/NMYJM8/", "title": "EffVer: Versioning code by the effort required to upgrade", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "Many notable PyData projects including Jupyter Hub, Matplotlib and JAX follow a versioning scheme called EffVer, where instead of making promises around backward compatibility they communicate the likelihood and magnitude of the work required to adopt a new version.\r\n\r\nIn this talk we will dive into EffVer, what it is and what it means for developers and users. We will discuss how to apply EffVer to your own projects and how to depend on projects that use it.", "description": "Intended Effort Versioning (EffVer), the version scheme where you just tell your users what order of magnitude to expect the upgrade effort to be.\r\n\r\nVersion numbers are hard to get right. Semantic Versioning (SemVer) communicates backward compatibility via version numbers which often lead to a false sense of security and broken promises. 
Calendar Versioning (CalVer) sits at the other extreme of communicating almost no useful information at all.\r\n\r\nMany Python projects follow a looser scheme called EffVer where instead of making promises around backward compatibility they communicate the likelihood and magnitude of work required to adopt a new version.", "recording_license": "", "do_not_record": false, "persons": [{"code": "EE7H7J", "name": "Jacob Tomlinson", "avatar": "https://cfp.pydata.org/media/avatars/EE7H7J_KtXk3nN.webp", "biography": "Jacob Tomlinson is a senior software engineer at NVIDIA. His work involves maintaining open source projects including RAPIDS and Dask. He also tinkers with kr8s in his spare time. He lives in Exeter, UK.", "public_name": "Jacob Tomlinson", "guid": "effa7a13-dc1e-59d2-ad5d-09840937dc0c", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/EE7H7J/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/NMYJM8/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/NMYJM8/", "attachments": []}, {"guid": "ff2709e2-78c2-5ace-9da3-a38a4561282d", "code": "UXHBEZ", "id": 78708, "logo": null, "date": "2025-12-10T13:30:00+00:00", "start": "13:30", "duration": "01:30", "room": "General Track", "slug": "pydataglobal2025-78708-hands-on-with-blosc2-accelerating-your-python-data-workflows", "url": "https://cfp.pydata.org/pydataglobal2025/talk/UXHBEZ/", "title": "Hands-on with Blosc2: Accelerating Your Python Data Workflows", "subtitle": "", "track": "General Track", "type": "Tutorial", "language": "en", "abstract": "As datasets grow, I/O becomes a primary bottleneck, slowing down scientific computing and data analysis. This tutorial provides a hands-on introduction to Blosc2, a powerful meta-compressor designed to turn I/O-bound workflows into CPU-bound ones. 
We will move beyond basic compression and explore how to structure data for high-performance computation.\r\n\r\nParticipants will learn to use the python-blosc2 library to compress and decompress data with various codecs and filters, optimizing for speed and ratio. The core of the tutorial will focus on the Blosc2 NDArray object, a chunked, N-dimensional array that lives on disk or in memory. Through a series of interactive exercises, you will learn how to perform out-of-core mathematical operations and analytics directly on compressed arrays, effectively handling datasets larger than available RAM.\r\n\r\nWe will also cover practical topics like data storage backends, two-level partitioning for faster data slicing, and how to integrate Blosc2 into existing NumPy-based workflows. You will leave this session with the practical skills needed to significantly accelerate your data pipelines and manage massive datasets with ease.", "description": "## Audience & Prerequisites\r\n\r\nThis tutorial is for data scientists, engineers, and researchers who work with large numerical datasets in Python.\r\n\r\nPrerequisites: Attendees should have intermediate Python programming skills and be comfortable with the basics of NumPy arrays. 
No prior experience with Blosc2 is necessary.\r\n\r\nSetup: Participants will need a laptop and can follow along using a provided cloud-based environment (e.g., Binder) or a local installation of Python, Jupyter, and the python-blosc2 library.\r\n\r\n## Learning Objectives\r\n\r\nBy the end of this tutorial, attendees will be able to:\r\n\r\n* Understand the core concepts behind the Blosc2 meta-compressor.\r\n* Compress and decompress NumPy arrays, tuning parameters for optimal performance.\r\n* Create, manipulate, and slice Blosc2 NDArray objects for out-of-core processing.\r\n* Perform efficient mathematical computations directly on compressed data.\r\n* Store and retrieve compressed datasets using different storage backends.\r\n* Integrate Blosc2 into their existing data analysis workflows to mitigate I/O bottlenecks.\r\n\r\n## Outline (90 minutes)\r\n\r\n### Introduction & Setup (10 mins)\r\n\r\n  * The I/O Bottleneck Problem.\r\n  * Core Concepts: What are meta-compressors, chunks, and blocks?\r\n  * Tutorial environment setup (Jupyter notebooks).\r\n\r\n### Part 1: Compression Fundamentals (20 mins)\r\n\r\n  * Hands-on: Using blosc2.compress() and blosc2.decompress().\r\n  * Exploring codecs (lz4, zstd), compression levels, and filters (shuffle, bitshuffle).\r\n  * Exercise: Compressing a sample dataset and analyzing the trade-offs between speed and ratio. \r\n\r\n### Part 2: The NDArray - Computing on Compressed Data (35 mins)\r\n\r\n  * Hands-on: Creating NDArray objects from scratch and from NumPy arrays.\r\n  * Storing arrays on-disk vs. 
in-memory.\r\n  * Exercise: Slicing and accessing data from an on-disk NDArray.\r\n  * Performing mathematical operations (arr * 2 + 1) and reductions (arr.sum()) on compressed data.\r\n  * Exercise: Analyzing a dataset larger than RAM.\r\n\r\n### Part 3: Advanced Features & Integration (20 mins)\r\n\r\n  * Hands-on: Using two-level partitioning (meta-chunks) for faster slicing.\r\n  * Brief overview of Caterva2 for sharing compressed data via an API.\r\n  * Recap and Q&A.\r\n\r\nRepository: Tutorial materials including notebooks and datasets will be available at a public GitHub repository (link to be provided upon acceptance).", "recording_license": "", "do_not_record": false, "persons": [{"code": "9NVTBY", "name": "Francesc Alted", "avatar": "https://cfp.pydata.org/media/avatars/9NVTBY_VJmx64s.webp", "biography": "I am a curious person who studied Physics (BSc, MSc) and Applied Maths (MSc). I spent over a year at [CERN](https://home.cern) for my MSc in High Energy Physics. However, I found maths and computer sciences equally fascinating, so I left academia to pursue these fields. Over the years, I developed a passion for handling large datasets and using compression to enable their analysis on commodity hardware accessible to everyone.\r\n\r\nI am the CEO of [ironArray SLU](https://ironarray.io) and also leading the [Blosc Development Team](https://blosc.org), and currently interested in determining, ahead of time, which [combinations of codecs and filters can provide a personalized compression experience](https://ironarray.io/btune). 
I am also very excited in providing a way for sharing Blosc2 datasets in the network in an easy and effective way via [Caterva2](https://ironarray.io/caterva2-doc/index.html), and [Cat2Cloud](https://ironarray.io/cat2cloud), a software as a service for handling and computing with datasets directly in the cloud.\r\n\r\nAs an Open Source believer, I started the [PyTables project more than 20 years ago](https://www.blosc.org/posts/pytables-20years/).  After 25 years in this business, I started several other useful open source projects like [Blosc2](https://blosc.org), [Caterva2](https://github.com/ironArray/Caterva2) and [Btune](https://github.com/ironArray/Blosc2-Btune); those efforts won me two prizes that mean a lot to me:\r\n\r\n* 2023: [NumFOCUS Project Sustainability Award](https://www.blosc.org/docs/NumFOCUS-Sustainability-Award.jpg)\r\n* 2017: [Google\u2019s Open Source Peer Bonus](https://opensource.googleblog.com/2017/10/more-open-source-peer-bonus-winners.html)\r\n\r\nYou can know more on what I am working on by reading my [latest blogs](https://www.blosc.org/authors/francesc-alted).", "public_name": "Francesc Alted", "guid": "00dedc3e-7285-5779-a36c-aff03278c2c0", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/9NVTBY/"}, {"code": "WFFNSW", "name": "Luke Shaw", "avatar": "https://cfp.pydata.org/media/avatars/WFFNSW_vNXtApx.webp", "biography": "Degree in Physics, Princeton University, 2019\r\nMasters in Applied Mathematics, University of Edinburgh, 2020\r\nPhD in Applied Mathematics, Universitat Jaume I 2024\r\nWorking at ironArray as engineer and product owner since 2025.", "public_name": "Luke Shaw", "guid": "7870638e-1f99-530d-8948-c655a7a7ebb3", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/WFFNSW/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/UXHBEZ/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/UXHBEZ/", "attachments": []}, {"guid": "f37c0c4f-754c-5dc1-8f66-131d41e17fd0", 
"code": "NKQFBQ", "id": 86007, "logo": null, "date": "2025-12-10T16:00:00+00:00", "start": "16:00", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-86007-keynote-david-aronchick-from-pandas-to-policy-as-code-the-future-of-ml-data-engineering", "url": "https://cfp.pydata.org/pydataglobal2025/talk/NKQFBQ/", "title": "Keynote: David Aronchick- From Pandas to Policy-as-Code: The Future of ML Data Engineering", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "Machine learning teams today are drowning in massive volumes of raw, redundant data that inflate training costs, slow down experimentation, and degrade model quality. The core architectural flaw is that we apply control too late, after the data has already been moved into centralized stores or training clusters, creating waste, instability, and long iteration cycles. What if we could fix this problem right at the source?\r\n\r\nIn this talk, we\u2019ll discuss a playbook for shifting ML data filtering, transformation, and governance upstream, directly where data is generated. 
We\u2019ll walk through a declarative, policy-as-code framework for building distributed pipelines that intelligently discard noise, balance datasets, and enrich signals before they ever reach your model training infrastructure.\r\n\r\nDrawing from real-world ML workflows, we\u2019ll show how this \u201cupstream control\u201d approach can reduce dataset size, cut model onboarding time in half, and embed reproducibility and compliance directly into the ML lifecycle rather than patching them in afterward.\r\n\r\nAttendees will leave with:\r\n- A mental model for analyzing and optimizing the ML data supply chain.\r\n- An understanding of tools for declarative, source-level ML data controls.\r\n- Actionable strategies to accelerate iteration, lower training costs, and improve model outcomes.", "description": "For over a decade, the Python ecosystem has given us a powerful arsenal to tame data. We started with the interactive magic of Pandas on a single machine, a revolutionary step that made complex analysis accessible. When our ambitions (and data) outgrew our laptops, we turned to Dask and Spark to scale our computations across clusters. More recently, projects like Apache Arrow began solving the critical problem of creating a standardized, efficient language for these distributed systems to speak.\r\n\r\nEach step in this journey solved a painful bottleneck. Yet, in our success, we've created a new one: the runaway cost and complexity of the \"ingest-it-all-first\" paradigm. Our cloud bills have become a tax on raw, unfiltered data, and our elegant downstream tools\u2014from Airflow and dbt to our own ML models\u2014are forced to waste expensive cycles sifting through noise just to find the signal.\r\n\r\nThis talk argues for the next logical step in our stack's evolution: an Upstream Data Control Plane. We'll explore a playbook for applying intelligent filtering, transformation, and governance before data ever hits your expensive lakehouse. 
Just as Dask parallelized our processing and Arrow standardized our memory, this approach optimizes our data in motion, ensuring that our powerful downstream systems operate only on the high-value signals we care about. Join us to learn a declarative, policy-as-code framework that makes your entire data stack cheaper, faster, and more resilient.", "recording_license": "", "do_not_record": false, "persons": [{"code": "L8WCLF", "name": "David Aronchick", "avatar": "https://cfp.pydata.org/media/avatars/L8WCLF_FdnmfFp.webp", "biography": "David Aronchick is CEO of Expanso (expanso.io), the global, intelligent pipeline company.\r\n\r\nPreviously, he led Compute over Data at Protocol Labs, Open Source Machine Learning Strategy at Azure, was a product manager for Kubernetes on behalf of Google, launched Google Kubernetes Engine, and co-founded the Kubeflow project and the SAME project. He has also worked at Amazon, Chef and co-founded three startups.\r\n\r\nWhen not spending too much time in service of electrons, he can be found on a mountain (on skis), 
traveling the world (via restaurants) or participating in kid activities, of which there are a lot more than he remembers than when he was that age.", "public_name": "David Aronchick", "guid": "d3a9d6c3-8252-582e-97cc-fe8270ef9209", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/L8WCLF/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/NKQFBQ/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/NKQFBQ/", "attachments": []}, {"guid": "2f2fbfa5-386d-5932-8b73-222d80cbf292", "code": "BSY9GA", "id": 78694, "logo": null, "date": "2025-12-10T16:30:00+00:00", "start": "16:30", "duration": "01:30", "room": "General Track", "slug": "pydataglobal2025-78694-python-polars-the-definitive-crash-course", "url": "https://cfp.pydata.org/pydataglobal2025/talk/BSY9GA/", "title": "Python Polars: The Definitive Crash Course", "subtitle": "", "track": "General Track", "type": "Tutorial", "language": "en", "abstract": "Polars is a lightning fast DataFrame library that is taking the data science community by storm. Its elegant and expressive API makes analyses pleasant to write and efficient to run. In this workshop, we\u2019ll demonstrate how Polars enables data scientists to go from raw data to reports\u2013by reading, transforming, and visualizing data.", "description": "Based on the book Python Polars: The Definitive Guide, we\u2019ll teach the essentials of Polars to read, transform, and visualize data. 
While a hallmark of Polars is its speed, we\u2019ll emphasize the benefits of its expression system for writing flexible, maintainable code.\r\n\r\nThis hands-on workshop will cover:\r\n\r\n* Reading data from CSV, spreadsheets, Parquet, and databases\r\n* Common transformations such as selecting, filtering, sorting, and aggregating\r\n* Complex data types, including text, time, and nested structures \r\n* Expressions, the building blocks of every query\r\n* Visualizing data\r\n\r\nBy the end of this workshop, attendees will have gained a solid understanding of Polars, and be equipped to start applying this lightning fast DataFrame library to their own datasets. No prior knowledge of Polars is required.", "recording_license": "", "do_not_record": false, "persons": [{"code": "C7AFFQ", "name": "Jeroen Janssens", "avatar": "https://cfp.pydata.org/media/avatars/C7AFFQ_jZMuQv9.webp", "biography": "Jeroen Janssens, PhD, is Head of Developer Relations at Posit, PBC. His expertise lies in visualizing data, implementing machine learning models, and building solutions using Python, R, JavaScript, and Bash. He\u2019s passionate about open source and sharing knowledge. He\u2019s the author of Python Polars: The Definitive Guide (O\u2019Reilly, 2025) and Data Science at the Command Line (O\u2019Reilly, 2021). Jeroen holds a PhD in machine learning from Tilburg University and an MSc in artificial intelligence from Maastricht University. 
He lives with his wife and two kids in Rotterdam, the Netherlands.", "public_name": "Jeroen Janssens", "guid": "eba629fb-1976-5196-b046-7dc5a7495533", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/C7AFFQ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/BSY9GA/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/BSY9GA/", "attachments": []}, {"guid": "dcd50da4-a35d-5109-9651-73cf821b20ae", "code": "N7EAFM", "id": 78145, "logo": null, "date": "2025-12-10T18:00:00+00:00", "start": "18:00", "duration": "01:30", "room": "General Track", "slug": "pydataglobal2025-78145-time-series-analysis-for-coupled-neurons", "url": "https://cfp.pydata.org/pydataglobal2025/talk/N7EAFM/", "title": "Time series analysis for coupled neurons.", "subtitle": "", "track": "General Track", "type": "Tutorial", "language": "en", "abstract": "The complex nervous system provides a repertoire of evolutionary properties like neuron spiking, bursting, and chaos that are yet to be fully understood. One approach is to tackle these time-dependent properties using the technique of \"dynamical systems\u201d, such as ordinary differential equations. Since the popular work by Hodgkin and Huxley, many dynamical systems models of neurons have been proposed, of which FitzHugh\u2013Nagumo and Morris\u2013Lecar models draw special attention. The nervous system is made of a network of neurons, possessing a complex structural and functional topology. This topology is a function of different parameters, among which the coupling strength plays a major role. Our focus would be to systematically study the effect of various coupling strategies on the firing patterns exhibited by a collection of neurons. 
In this workshop, my goal is to popularize a reduced-order model of neuron dynamics known as the \u201cdenatured Morris\u2013Lecar\u201d system and to teach how Python can be efficiently used to perform research on time series analysis of coupled neurons.", "description": "This is a tutorial on hands-on time series analysis of coupled neuron models. We will build mathematical models of coupled neurons, and then utilize tools from nonlinear dynamics to analyze simulated time series. We will discuss various empirically informed coupling strategies and statistically efficient time series measures. This workshop is 100% Jupyter notebook and will have room to openly brainstorm ideas to extend and improve the studies. Let\u2019s unravel some complex dynamics together!\r\n\r\nThe pipeline of this tutorial will be the following:\r\n(i) Start by building a coupled neuron system based on different coupling strategies,\r\n(ii) Simulate the system and generate time series data,\r\n(iii) Perform time series analysis by computing various metrics from the nonlinear dynamics literature,\r\n(iv) Finally, discuss what these metrics tell us about the temporal behavior of neurons.\r\n\r\nCoupling strategies we are going to look at:\r\n(i) Gap junction coupling\r\n(ii) Chemical coupling\r\n(iii) A hybrid coupling influenced by a superconductor model in physics\r\n(iv) Electromagnetic coupling\r\n(v) Coupling, which is not pairwise but higher-order (A bit of background on graph theory is recommended)\r\n(vi) A random coupling strategy\r\n\r\nWe will implement the following methodologies/algorithms for time series analysis of coupled neuron models:\r\n(i) Hurst exponent: measuring persistence of time series,\r\n(ii) Sample entropy: measuring the complexity of time series,\r\n(iii) 0\u20131 test: measuring chaos,\r\n(iv) Kuramoto order-parameter: measuring synchrony between the neurons.\r\n\r\nThis tutorial is 100% Python. And I will be utilizing Jupyter Notebooks to deliver the workshop. 
Packages that need to be downloaded beforehand are:\r\n(i) `matplotlib` for plotting,\r\n(ii) `numpy` and `scipy` for scientific computations,\r\n(iii) `nolds` for nonlinear measure for dynamical systems,\r\n(iv) `pandas` for data handling.\r\n\r\nThe audience would find this interesting because it would be a hands-on introduction to how the mechanisms of neurons can be explored using different tools from the nonlinear dynamics literature. Mathematically modelling the dynamics of neurons has attracted several researchers in recent years because of the popularity of artificial intelligence. This field of neuron dynamics is booming, and delivering this workshop would be timely. I would also ensure to leave some room for brainstorming further ideas with the audience and how this study could be potentially extended and improved, thus an interactive session. \r\n\r\nThe goal is to attract applied mathematicians, computer scientists, data scientists, engineers, and statisticians alike and provide them with a battery of tools to add to their knowledge base. The audience would then be able to apply these tools in domains other than neurodynamics, for example, climate, finance, or social science. The only technical background I would expect from the audience is familiarity with `matplotlib`, `numpy` and `pandas`, and some basic statistics (regression, correlation coefficient), linear algebra (matrix operations), and graphs (as in networks). After the tutorial, the audience will leave with a newly built insight into the mathematical modeling of neuron dynamics.\r\n\r\nHere is the breakdown of the tutorial:\r\n\r\n0\u201315 mins: Introduction to neurons as dynamical systems and why we care about their behavior over time. We will talk about a single neuron's behavior and the selection of a mathematical model. We will also talk about the bursting phenomenon in neurons.\r\n\r\n15-30 mins: We will then mathematically model a coupled system of neurons. 
We will cover the topic of \u201csmall networks\u201d of neurons and what they teach us about the bigger picture: a complex, connected nervous system.\r\n\r\n30-45 mins: Next, we will introduce various empirically informed coupling mechanisms. We will talk about how these couplings incorporate different firing patterns in the coupled neurons, ranging from regular behavior to chaotic firing.\r\n\r\n45-75 mins: Finally, I will introduce time series analysis of neuron data. We will then implement the algorithms mentioned above to realize different dynamical properties of the neurons.\r\n\r\n75-90 mins: Open the room to Q&A and brainstorm further ideas to improve/extend the analysis of neuron-time series data.\r\n\r\nAll materials for the tutorial can be accessed via this repository link: https://github.com/indrag49/PyData-Global-Tutorial-2025", "recording_license": "", "do_not_record": false, "persons": [{"code": "DCJ8DG", "name": "Indranil Ghosh", "avatar": "https://cfp.pydata.org/media/avatars/DCJ8DG_Kg8DEd6.webp", "biography": "Indra is a postdoctoral fellow in applied mathematics at Massey University, New Zealand, working on all things \"dynamical systems\". He takes a computational approach to tackle complex problems, and his current research is focused on understanding collective behaviour exhibited by coupled neurons. He is an avid Python user and has been a speaker at multiple Python-related conferences before. 
More information can be found in his website: https://indrag49.github.io/.", "public_name": "Indranil Ghosh", "guid": "1ed6bd7b-73d2-5185-8a55-a00e84ec5ebd", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/DCJ8DG/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/N7EAFM/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/N7EAFM/", "attachments": []}], "Machine Learning & AI": [{"guid": "e213fc0a-af6b-5390-b345-683f8d9d8965", "code": "ETQTHC", "id": 79349, "logo": null, "date": "2025-12-10T11:30:00+00:00", "start": "11:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-79349-using-mcp-to-turn-claude-into-a-football-opposition-analyst", "url": "https://cfp.pydata.org/pydataglobal2025/talk/ETQTHC/", "title": "Using MCP to turn Claude into a Football Opposition Analyst", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Advanced statistics are transforming sports analysis, but many coaches and ex-players struggle to access meaningful insights due to complex data and jargon. Generative AI offers a solution. \r\n\r\nIn this talk, I\u2019ll demonstrate how I used Model Context Protocol (MCP) to turn Anthropic\u2019s Claude Desktop into a football opposition analyst, making advanced performance data accessible and actionable. \r\n\r\nTopics include how MCP enables AI to interpret domain-specific knowledge and real examples of AI-generated football insights.", "description": "Analysis in sports is changing. Advanced statistics like Wins Above Replacement (WAR) or Expected Goals (xG) are making their way into TV punditry and conversations in bars. 
But the people who need the information the most, ex-professionals and coaches without a background in statistics, often shun it.\r\n\r\nNot because they don't see the value, but because the language is impenetrable, the underlying data is overwhelming, and the insights are difficult to translate.\r\n\r\nGenerative AI provides an opportunity to bridge the gap.\r\n\r\nIn this talk, I'll share how I used Model Context Protocol (MCP) to turn Anthropic's Claude Desktop into a football opposition analyst by providing access to team and player performance event data, and in turn lower the barriers so anyone can turn a sea of numbers into actions.\r\n\r\nThis talk will cover:\r\n\r\n- How MCP enables AI to access and interpret domain-specific knowledge\r\n- Real examples of AI-generated football insights in action", "recording_license": "", "do_not_record": false, "persons": [{"code": "YR8YBP", "name": "Adam Cowley", "avatar": "https://cfp.pydata.org/media/avatars/YR8YBP_noR5QF6.webp", "biography": "Adam Cowley is Manager of Developer Education at Neo4j. He leads the team behind GraphAcademy, Neo4j\u2019s developer learning platform. His 20+ years of experience spans software engineering, data analysis, and product ownership. 
He is currently focused on applying Generative AI to create more personalised developer education.", "public_name": "Adam Cowley", "guid": "f234d7c2-d13d-569f-9468-833743a2fbfd", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/YR8YBP/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/ETQTHC/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/ETQTHC/", "attachments": []}, {"guid": "7a6221e4-7e62-59f4-a714-75691988009b", "code": "EKX7LV", "id": 78515, "logo": null, "date": "2025-12-10T12:00:00+00:00", "start": "12:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78515-the-human-side-leading-and-mentoring-global-data-teams-in-the-age-of-ai", "url": "https://cfp.pydata.org/pydataglobal2025/talk/EKX7LV/", "title": "The Human Side: Leading and Mentoring Global Data Teams in the Age of AI", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Building great AI-driven products starts with empowered teams. Hear proven strategies for leading, mentoring, and growing distributed engineering teams, with lessons in innovation, compliance, and diversity from global digital enterprises.", "description": "For engineering leaders, managers, and aspiring mentors. 
Session covers structures for remote work, upskilling, cross-cultural collaboration, promoting innovation, and embedding compliance and ethics in technical work\u2014from real executive experience.\r\n\r\nIf you would like hands-on tutorials for any of the 30-minute talks, or wish to tailor for a specific audience (engineering, product, executive), content can be customized to fit workshop/intermediate/advanced levels.", "recording_license": "", "do_not_record": false, "persons": [{"code": "P7BRUD", "name": "amar naik", "avatar": "https://cfp.pydata.org/media/avatars/P7BRUD_y2lPCx0.webp", "biography": "With over two decades of experience in the IT industry, I am a Strategic Engineering Leader with a deep focus on digital transformation, AI integration, and data-driven product innovation. My expertise lies in architecting intelligent systems that combine different tools/agents/systems, automation, and analytics to solve complex business challenges across sectors such as fintech, healthcare, and public services.\r\n\r\nOver the past few years, I\u2019ve led the development and deployment of multiple solutions that automate knowledge retrieval, orchestrate multi-step business workflows, and enhance human decision-making. I have used AI frameworks like LangChain, CrewAI, and ReAct to design scalable multi-agent systems that balance autonomy with control. These implementations have significantly improved operational efficiency, user experience, and stakeholder engagement.\r\n\r\nA strong advocate for practical, ethical, and secure AI adoption, I help organizations bridge the gap between emerging AI capabilities and enterprise readiness. I\u2019ve mentored global engineering teams and consultants in building AI-driven platforms, fostering a culture of innovation, experimentation, and continuous learning.\r\n\r\nMy passion lies in enabling businesses to move beyond automation and into intelligent collaboration \u2014 where human and agent teams co-create value. 
I am excited to contribute to the evolving AI ecosystem and share insights that empower teams to build resilient, human-aligned AI solutions", "public_name": "amar naik", "guid": "de47c9fb-ebca-52ed-b9e1-3f8c1c886344", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/P7BRUD/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/EKX7LV/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/EKX7LV/", "attachments": []}, {"guid": "75526f82-0995-50b4-9587-34f4d7246231", "code": "J7JK79", "id": 78692, "logo": null, "date": "2025-12-10T12:30:00+00:00", "start": "12:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78692-realtime-financial-fraud-detection-with-modern-python", "url": "https://cfp.pydata.org/pydataglobal2025/talk/J7JK79/", "title": "Realtime Financial Fraud Detection with Modern Python", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Building ML models for financial fraud detection sounds straightforward, until you have to evaluate, validate, and deploy them in real-world pipelines. This talk walks through the practical stack, metrics, and mindsets needed to build fraud detection systems with modern Python. We'll cover key challenges like concept drift, extreme class imbalance, false-positive overload, and why the usual ML workflows fall short. Along the way, we\u2019ll explore a real-world architecture using classical ML, deep learning, and GNNs, plus the validation techniques and production patterns that make or break fraud systems. If you're tired of toy problems and want patterns that survive real money and real latency, this talk\u2019s for you.", "description": "This talk distills a production\u2011tested path for real\u2011time financial fraud detection in Python (inc. 
choosing the right objective, validating in time, and shipping with guardrails).\r\n\r\nCore idea:\r\n\r\nOptimize the business decision (alerts under cost/latency constraints), not just the ML score.\r\n\r\nOutline (30 minutes):\r\n\r\n1. Problem framing: Adversaries, label delay, extreme imbalance, and why \u201caccuracy\u201d lies.\r\n\r\n2. Metrics that matter: Precision and recall, AUC\u2011PR vs ROC, cost\u2011weighted utility, calibration for decisions.\r\n\r\n3. Validation done right: Temporal splits, rolling/blocked CV with gap, prequential test\u2011then\u2011train, leak and drift traps.\r\n\r\n4. Modeling under latency budgets: Where XGBoost shines, when to add tabular DL, injecting graph signals without blowing latency (simple handcrafted graph stats + GNNs).\r\n\r\n5. From notebook to service: Small, testable core, FastAPI endpoint, thresholds and shadow mode, alert quotas, analyst feedback loops.\r\n\r\n6. Operations & monitoring: Drift indicators, calibration checks, label\u2011delay dashboards, canaries/rollbacks.\r\n\r\n7. Wrap\u2011up/Q&A: Failure modes and a 1\u2011page runbook.\r\n\r\nAttendee outcomes:\r\n\r\n- A copy\u2011and\u2011adapt roadmap for deploying financial fraud detection services with Python.\r\n\r\n- A latency\u2011aware model selection heuristic.\r\n\r\n- A minimal deployment pattern (service, thresholds, monitoring) that scales from pilot to production.\r\n\r\nPrior knowledge expected:\r\n\r\n- Basic Python and DataFrames, ML classification basics, HTTP/JSON.", "recording_license": "", "do_not_record": false, "persons": [{"code": "APLPNP", "name": "C\u00e9sar Soto Valero", "avatar": "https://cfp.pydata.org/media/avatars/APLPNP_fOgnjY2.webp", "biography": "C\u00e9sar is currently a Data Scientist at SEB Group, where he develops AI models to enhance the security of financial transactions on a global scale. He completed an M.Sc. in Machine Learning and moved to Sweden in 2018 to pursue a Ph.D. 
in Computer Science at KTH Royal Institute of Technology. During his five years at KTH, he pioneered open-source tools and techniques to mitigate software bloat, contributing to the efficiency and security of modern software systems. C\u00e9sar is deeply passionate about AI, science, and technology, with a strong focus on bridging cutting-edge research with real-world applications. He is dedicated to advancing AI\u2019s role in building smarter, more resilient systems that drive innovation.", "public_name": "C\u00e9sar Soto Valero", "guid": "e05271b4-d3ac-54f0-a38d-a3126e797cdf", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/APLPNP/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/J7JK79/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/J7JK79/", "attachments": []}, {"guid": "a7b19eae-15cb-5c4b-b293-59f97afb8e3a", "code": "W9RJKW", "id": 78489, "logo": null, "date": "2025-12-10T13:00:00+00:00", "start": "13:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78489-how-to-effectively-use-text-embeddings-in-tree-based-models", "url": "https://cfp.pydata.org/pydataglobal2025/talk/W9RJKW/", "title": "How to Effectively use text embeddings in tree based models", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Text embeddings are a powerful tool for encoding the essence of unstructured text data into a structured, dense, multidimensional vector representation. Due to their inner structure, tree based models such as decision trees, gradient boosted decision trees and random forests struggle to effectively use text embeddings features. 
This is due to the fact that trees can use only one feature every time they split, so the number of used embedding dimensions is limited to the tree depth.\r\n\r\nOther models, such as linear models for example, can use text embeddings more effectively because they are able to use all of the embedding dimensions simultaneously.\r\n\r\nIn this presentation we will present a novel approach to transform text embedding features into a format that tree-based models can effectively use. The proposed approach combines the strengths of non-tree based models with the predictive power of tree based models to create a more effective feature representation for tree-based models.", "description": "The presentation is aimed at Data Science and Machine Learning practitioners who are already familiar with tree-based models and want to learn how to effectively incorporate text embeddings features to boost the performance of their models.\r\n\r\nThe methodology showcased in the presentation is available in the sklearo open source package.\r\n\r\nThe structure of the talk will be as follows:\r\n\r\n- **5 minutes** Overview of text embeddings, how tree-based models are built, and the challenges they face with text embeddings compared to linear models.\r\n- **5 minutes** Explanation of how we can leverage non-tree based models to transform text embeddings into a format that tree based models can effectively use.\r\n- **5 minutes** Explanation of *cross-fitting*, a technique used to avoid target leakage when generating features using the target variable.\r\n- **5 minutes** Code examples of how this technique can be used in practice using the `sklearo` open source library.\r\n- **5 minutes** Performance comparison of tree based models using text embeddings as-is vs using the transformed features.\r\n\r\nPrior knowledge about fundamental machine learning concepts such as overfitting, cross-validation, and feature engineering is recommended but not required.", "recording_license": "", 
"do_not_record": false, "persons": [{"code": "DTAK73", "name": "Claudio Salvatore Arcidiacono", "avatar": "https://cfp.pydata.org/media/avatars/DTAK73_3fnumrk.webp", "biography": "I am Claudio, a Senior Data Scientist at Mollie. I have been working in the fintech sector over the past 7 years, I have lots of experience in classical machine learning problems, mainly in binary classification problems. I love to contribute to data science open source packages like feature engine, scikit-learn and narwhals. I maintain a couple of packages myself (felimination and sklearo). In my free time I am a coffee scientist, I use a data driven approach to dial in the perfect cup of espresso.", "public_name": "Claudio Salvatore Arcidiacono", "guid": "4bfb5293-9869-585e-90e7-b95cfa855d1a", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/DTAK73/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/W9RJKW/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/W9RJKW/", "attachments": []}, {"guid": "3eb6c7b1-c505-59aa-bc40-d5f86a69d106", "code": "BT7M3S", "id": 79251, "logo": null, "date": "2025-12-10T13:30:00+00:00", "start": "13:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-79251-optimal-variable-binning-in-logistic-regression", "url": "https://cfp.pydata.org/pydataglobal2025/talk/BT7M3S/", "title": "Optimal Variable Binning in Logistic Regression", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "In many regulated industries\u2014finance, healthcare, insurance\u2014logistic regression remains the model of choice for its interpretability and regulatory acceptability. Yet capturing non-linear effects and interactions often requires variable binning, and naive approaches (equal-width or quantile cuts) can either wash out signal or invite overfitting. 
In this 30-minute session, data scientists and risk analysts with a working knowledge of logistic regression and Python will learn to:\r\n\r\n- Diagnose the weaknesses of basic binning strategies.\r\n- Select and apply optimal-binning algorithms for different use cases.\r\n- Assess bin stability and guard against model overfit.\r\n\r\nAll code, data samples, and a notebook will be available on GitHub.", "description": "Despite the rise of complex \u201cblack-box\u201d models, regulated environments still demand transparency. Properly binned variables not only improve model fit but also yield coefficients that the business and auditors can interpret. However, determining cut-points that preserve true signal while avoiding data-snooping bias is non-trivial.\r\n\r\nBy the end of this session, attendees will be able to:\r\n\r\n- Understand the basic idea behind binning (the what)\r\n- Know in which contexts variable binning makes sense (the when and why).\r\n- Choose among popular optimal-binning techniques (e.g., ChiMerge, MDLP, decision-tree-based) based on data size, feature type, and operational constraints (the how).\r\n\r\nWho Should Attend?\r\n\r\nData scientists and risk analysts who use logistic regression in regulated settings and need a reproducible, explainable feature-engineering pipeline.\r\n\r\nDetailed 30-Minute Agenda\r\n\r\n| Time | Topic |\r\n| --- | --- |\r\n| 0\u20133 min | Context & Why Binning Matters in explainability |\r\n| 3\u20138 min | Pitfalls of Na\u00efve Binning (examples from real-life) |\r\n| 8\u201318 min | Binning as an optimization problem: Algorithms & Decision Criteria |\r\n| 18\u201326 min | Hands-On Python Demo: From Data to Defensible Bins |\r\n| 26\u201330 min | Q&A, Resources & Next Steps |\r\n\r\nPrerequisites & Materials\r\n\r\n- Prerequisites: Basic Python (pandas, scikit-learn) and logistic-regression familiarity\r\n- Materials: GitHub repo with notebook, data samples, will be shared during the talk\r\n\r\nYou\u2019ll 
leave equipped to choose the right optimal\u2010binning algorithm for your data.", "recording_license": "", "do_not_record": false, "persons": [{"code": "YPKRWZ", "name": "Charaf ZGUIOUAR", "avatar": "https://cfp.pydata.org/media/avatars/YPKRWZ_NFgouvt.webp", "biography": "Quantitative Finance and Econometrics Graduate from Sorbonne's University. Currently working as Data Scientist at BNP Paribas & as lecturer at Sorbonne's University.", "public_name": "Charaf ZGUIOUAR", "guid": "d921573b-31de-577e-9715-37d31559f99c", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/YPKRWZ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/BT7M3S/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/BT7M3S/", "attachments": []}, {"guid": "469beb67-eff8-56e8-af1c-b9880862901e", "code": "YPRZBE", "id": 78703, "logo": null, "date": "2025-12-10T14:00:00+00:00", "start": "14:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78703-bundestag-chat-discovering-political-landscape-with-rag-systems", "url": "https://cfp.pydata.org/pydataglobal2025/talk/YPRZBE/", "title": "Bundestag Chat: Discovering Political Landscape with RAG Systems", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Retrieval-Augmented Generation (RAG) systems are transforming how we interact with unstructured data using Large Language Models (LLMs). While it\u2019s now relatively easy to stand up a basic RAG prototype, deploying a robust, customizable, and production-ready system remains challenging.\r\nIn this talk, we present our open-source RAG blueprint through the lens of a real-world application: Bundestag Chat\u2014a system that enables users to explore and converse with German parliamentary speeches. 
We\u2019ll demonstrate how the blueprint streamlined development and scaling, and how its modular architecture allowed for seamless integration of components like LlamaIndex, Hugging Face embeddings, PGVector, Langfuse, and Ragas.\r\nAttendees will walk away with practical insights into customizing RAG pipelines for real use cases, whether building internal tools or user-facing applications. We\u2019ll also explore build-vs-buy trade-offs, retrieval and scaling strategies, and considerations around privacy, evaluation, and monitoring.", "description": "Retrieval-Augmented Generation (RAG) systems are among the most impactful applications of LLMs, allowing for intelligent querying and contextual understanding of unstructured data. However, turning a prototype into a polished, scalable product is often where complexity sets in.\r\n\r\nIn this talk, we walk through how our open-source RAG blueprint was used to create *Bundestag Chat*\u2014a system that allows users to interact with over a decade of German parliamentary debates via a chat interface. This real-world use case illustrates the key benefits of our blueprint: modularity, observability, evaluation, and scalability.\r\n\r\nOur architecture includes:\r\n\r\n- **LlamaIndex** for document parsing and chunking,\r\n- **Hugging Face embedding models** stored in a **PGVector** vector database,\r\n- **Chainlit** for an intuitive chat UI,\r\n- **Langfuse** for logging, observability, and feedback collection,\r\n- **Ragas** for evaluating response quality across dimensions like faithfulness and relevance.\r\n\r\nWhat made this system successful was the flexibility to swap components, configure data flows, and monitor performance from day one. This modular design made it straightforward to go from an initial prototype to a system deployed in a privacy-sensitive environment.\r\n\r\nWe\u2019ll also contrast open-source and commercial RAG stacks, sharing insights on when to build versus buy. 
Topics include:\r\n\r\n- Estimating system requirements across different workloads,\r\n- Evaluating model performance and output reliability,\r\n- Ensuring data privacy and legal compliance,\r\n- Gathering and acting on human feedback to improve quality.", "recording_license": "", "do_not_record": false, "persons": [{"code": "7SPANE", "name": "Piotr Kalota", "avatar": "https://cfp.pydata.org/media/avatars/7SPANE_dvAp4Ef.webp", "biography": "Piotr Kalota is a Machine Learning Engineer at FELD M with a Master\u2019s in Human-Centered AI from DTU. Specializing in NLP and accessible tech, he develops retrieval-augmented generation (RAG) systems and other LLM-driven solutions. With four years of experience in software engineering and machine learning, he combines human-centered design and innovation to create accessible AI solutions.", "public_name": "Piotr Kalota", "guid": "5b9e3aa9-a27e-5fa7-b983-6ead50fae692", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/7SPANE/"}, {"code": "G8XJHK", "name": "Matthias Boeck", "avatar": "https://cfp.pydata.org/media/avatars/G8XJHK_j10mTom.webp", "biography": "Dr. Matthias B\u00f6ck holds a doctorate in bioinformatics and machine learning and has been working as a data scientist in the Data Product department at the Munich-based consultancy FELD M since 2013. He is the technical manager for projects in the fields of machine learning and data strategy. He is the author of specialist books on AI, holds design thinking workshops and works with universities on research projects. 
In addition to these fields, he is also involved in the topic of data for good and its use in practice.", "public_name": "Matthias Boeck", "guid": "545f6528-a58a-5ae2-95e1-c5c721e4fe12", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/G8XJHK/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/YPRZBE/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/YPRZBE/", "attachments": []}, {"guid": "974ca86c-1a0f-5a84-8701-02b64fd93c23", "code": "GS9GQP", "id": 78626, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/GS9GQP/system_design_aE48_PHvigwv.webp", "date": "2025-12-10T16:00:00+00:00", "start": "16:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78626-building-production-ready-research-ai-assistants-with-one-command-setup", "url": "https://cfp.pydata.org/pydataglobal2025/talk/GS9GQP/", "title": "Building Production-Ready Research AI Assistants with One-Command Setup", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Academic research is often fragmented across dense PDFs, complex jargon, and scattered media articles, making it hard to access for students, interns, and the broader public. To address this, we introduce **Lab Lens**: an open-source Research AI Assistant that unifies a lab\u2019s papers and media coverage into a conversational system, where anyone can ask natural language questions and receive structured answers with full source citations.\r\n\r\nThis talk demonstrates how to build and deploy a production-ready RAG pipeline that uses Landing.AI for vision-based PDF parsing, Firecrawl for media extraction, and LangGraph for agentic orchestration. 
The entire system is containerized with FastAPI and Streamlit, launching with a single command: docker compose up.\r\n\r\nAttendees will learn how to turn scattered research artifacts into a transparent, queryable knowledge base, making lab insights accessible, reproducible, and conversational for all.", "description": "In this talk, we introduce Lab Lens: an open-source framework for Research AI Assistant that allows labs to ingest scientific papers and media coverage, build a vector database, and query it via natural language\u2014all in one reproducible command.\r\n\r\nThis 30-minute talk will explore:\r\n\r\n- **\ud83e\udde0 Architecture:** How LangGraph, FastAPI, and Streamlit are combined with agentic reasoning for document Q&A.\r\n- **\ud83d\udcc4 Multi-modal Ingestion:** How Lab Lens uses Landing.AI (vision agentic document extraction) and Firecrawl to intelligently extract content from complex PDFs and dynamic media pages.\r\n- **\ud83e\udd16 LLM Workflow:** How intents are classified, documents retrieved, and responses synthesized with structured JSON output and source attribution.\r\n- **\ud83d\udd04 Reusability and Extensibility:** How any lab or research group can plug in their own documents and deploy in minutes.\r\n- **\u2699\ufe0f One-Line Setup:** How a single YAML config and docker compose up sets up ingestion, vectorization, API, UI, and Slack bot integration.\r\n\r\nWe'll conclude with a live demo showing how Lab Lens answers real research questions using citation-backed reasoning, emphasizing transparency, reliability, and ease of use.\r\n\r\nLab Lens is designed for reproducibility, minimal setup, and immediate utility. 
If you're interested in bringing GenAI to your research workflow\u2014or your research to the world\u2014this talk will show you exactly how.\r\n\r\n**Target Audience:** Researchers, students, and enthusiasts wanting practical AI tools.\r\n\r\n**Prerequisites:**\r\n- Python knowledge\r\n- Familiarity with containerization concepts.\r\n\r\n**Resources Provided:** Complete open-source codebase with Docker configuration for immediate deployment.\r\nRemember that the main goal/advantage here is to make it accessible for the whole lab's documents (papers and media coverage), so anyone can ask about it with a source citation.", "recording_license": "", "do_not_record": false, "persons": [{"code": "UFPWF3", "name": "Cain\u00e3 Max Couto da Silva", "avatar": "https://cfp.pydata.org/media/avatars/UFPWF3_rZpSwsE.webp", "biography": "I\u2019m a data scientist and AI engineer with 10+ years of experience across academic research and industry, building GenAI and machine learning solutions for research labs, startups, and Fortune 500 companies. 
I\u2019m also a passionate educator, contributing to data training programs as a professor and consultant, and an active open-source contributor and speaker at conferences like SciPy and PyData.", "public_name": "Cain\u00e3 Max Couto da Silva", "guid": "bc2083a3-b173-5b81-963b-c9f43b337cb1", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/UFPWF3/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/GS9GQP/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/GS9GQP/", "attachments": []}, {"guid": "90ec6b8c-e92e-5eb6-96e8-7c169119fe01", "code": "TXYJHL", "id": 78727, "logo": null, "date": "2025-12-10T16:30:00+00:00", "start": "16:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78727-optimizing-ai-ml-workloads-resource-management-and-cost-attribution", "url": "https://cfp.pydata.org/pydataglobal2025/talk/TXYJHL/", "title": "Optimizing AI/ML Workloads: Resource Management and Cost Attribution", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "The proliferation of AI/ML workloads across commercial enterprises, necessitates robust mechanisms to track, inspect and analyze their use of on-prem/cloud infrastructure. To that end, effective insights are crucial for optimizing cloud resource allocation with increasing workload demand, while mitigating cloud infrastructure costs and promoting operational stability.\r\n\r\nThis talk will outline an approach to systematically monitor, inspect and analyze AI/ML workloads\u2019 properties like runtime, resource demand/utilization and cost attribution tags . By implementing granular inspection across multi-player teams and projects, organizations can gain actionable insights into resource bottlenecks, identify opportunities for cost savings, and enable AI/ML platform engineers to directly attribute infrastructure costs to specific workloads. 
\r\n\r\nCost attribution of infrastructure usage by AI/ML workloads focuses on key metrics such as compute node group information,  cpu usage seconds, data transfer, gpu allocation , memory and ephemeral storage utilization. It enables platform administrators to identify competing workloads which lead to diminishing ROI. Answering questions from data scientists like \"Why did my workload run for 6 hours today, when it took only 2 hours yesterday\" or \"Why did my workload start 3 hours behind schedule?\" also becomes easier.\r\n\r\nThrough our work on Metaflow, we will showcase how we built a comprehensive framework for transparent usage reporting, cost attribution, performance optimization, and strategic planning for future AI/ML initiatives. Metaflow is a human centric python library that enables seamless scaling and management of AI/ML projects.\r\n\r\nUltimately, a well-defined usage tracking system empowers organizations to maximize the return on investment from their AI/ML endeavors while maintaining budgetary control and operational efficiency. Platform engineers and administrators will be able to gain insights into the following operational aspects of supporting a battle hardened ML Platform:\r\n\r\n1.Optimize resource allocation: Understand consumption patterns to right-size clusters and allocate resources more efficiently, reducing idle time and preventing bottlenecks.\r\n\r\n2. Proactively manage capacity: Forecast future resource needs based on historical usage trends, ensuring the infrastructure can scale effectively with increasing workload demand.\r\n\r\n3. Facilitate strategic planning: Make informed decisions regarding future infrastructure investments and scaling strategies.\r\n\r\n4.Diagnose workload execution delays: Identify resource contention, queuing issues, or insufficient capacity leading to delayed workload starts.\r\n\r\nData Scientists on the other hand will gain clarity on factors that influence workload performance. 
Tuning them can lead to efficiencies in runtime and associated cost profiles.", "description": "This abstract proposes a framework for systematically monitoring and analyzing AI/ML workloads to optimize resource utilization and effective cost attribution/management. By providing granular insights into resource consumption, the system helps identify cloud infra bottlenecks - leading to lower resource contention while promoting fairer use of resources.  Built on Metaflow, this approach enables transparent usage reporting, improved performance, and strategic planning for future AI/ML initiatives. Ultimately, it empowers organizations to maximize ROI from their AI/ML investments while maintaining budgetary control and operational efficiency for both platform engineers and data scientists.", "recording_license": "", "do_not_record": false, "persons": [{"code": "XAYYZD", "name": "Saurabh Garg", "avatar": "https://cfp.pydata.org/media/avatars/XAYYZD_m2zVunv.webp", "biography": "I'm currently focused on building a frictionless Machine Learning Platform at Outerbounds, where our mission is to let data scientists and ML engineers stay focused on AI/ML development\u2014while we manage the infrastructure that powers it.\r\n\r\nMy background is in large-scale distributed systems, with experience spanning cloud infrastructure and identity/authorization systems. I've worked on infrastructure teams at Oracle Cloud and Outerbounds, and on IAM/authorization platforms at Atlassian and Databricks.\r\n\r\nAt Atlassian, I was part of the team that built a CQRS-based permissions system deployed across six AWS regions, handling 100K+ read requests with sub-3ms P99 latencies.\r\n\r\nAt Databricks, I founded and led a 6-engineer team focused on authorization. 
We transitioned the platform from a monolithic client-based model to a service-oriented architecture, integrating with ~35 internal services and achieving P99 latencies under 1 second for over 10K requests per second.\r\n\r\nOutside of engineering, I enjoy spending time with my daughter, and I'm always up for a game of cricket or table tennis.", "public_name": "Saurabh Garg", "guid": "1165a73a-7a58-5faf-9b6b-63fc01eb4c21", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/XAYYZD/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/TXYJHL/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/TXYJHL/", "attachments": []}, {"guid": "86d545ed-70fe-5bb4-8fcc-8dbb9292d8c6", "code": "YBZLZK", "id": 79296, "logo": null, "date": "2025-12-10T17:00:00+00:00", "start": "17:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-79296-let-me-structure-freely-how-to-improve-llm-structured-output-quality", "url": "https://cfp.pydata.org/pydataglobal2025/talk/YBZLZK/", "title": "Let Me Structure Freely? How to Improve LLM Structured Output Quality", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Ever wonder why structured LLM output doesn\u2019t feel as reliable as its natural language responses?  At Khan Academy, we asked ourselves the same thing\u2014especially as we leaned heavily on JSON-based structured outputs to power our AI tutor, Khanmigo.\r\n\r\nSurprisingly, the root of the problem often lies in one of the most familiar tools in a Python developer\u2019s toolbox: the humble `dict`. In this talk, we follow the story of how dictionary ordering can shape (and sometimes distort) structured LLM output. 
We\u2019ll walk through how different frameworks\u2014OpenAI, Claude, LangChain, OpenRouter, vLLM\u2014handle structured responses, and why those differences matter more than you\u2019d expect.\r\n\r\nAlong the way, we\u2019ll share practical best practices we\u2019ve developed to improve structured output reliability, observe subtle failure cases, and debug weird edge behaviors. If you\u2019re building LLM apps with structured output, you\u2019ll leave with concrete tips\u2014and a deeper appreciation for the details that make or break your system.", "description": "Structured output (like JSON) is increasingly used in LLM applications to enforce a predictable schema and simplify downstream parsing. However, developers often assume that structured output is deterministic and robust\u2014until they run into subtle bugs. At Khan Academy, we\u2019ve run Khanmigo on structured JSON output since before it was even a supported feature. Along the way, we\u2019ve learned a lot about where things can go wrong.\r\n\r\nOur investigation began when we noticed inconsistent output quality across different LLM frameworks, even with identical prompts and models. The culprit? Python dictionary ordering and how different frameworks serialize JSON schemas.\r\n\r\nWe'll explore:\r\n\r\n* How Python's evolution from unordered (pre-3.7) to insertion-ordered dictionaries affects LLM frameworks, or how it lingers through other frameworks in (post-3.7)\r\n* Framework-specific serialization behaviors in OpenAI SDK, Anthropic SDK, LangChain, OpenRouter, and vLLM\r\n* Measurable impact on output quality through A/B testing results\r\n\r\nAttendees should have basic familiarity with Python and JSON, but no deep LLM expertise is required. 
We'll explain technical concepts clearly while providing actionable insights for immediate application.", "recording_license": "", "do_not_record": false, "persons": [{"code": "7XP8KU", "name": "Boris Lau", "avatar": "https://cfp.pydata.org/media/avatars/7XP8KU_R6ElNZX.webp", "biography": "Boris Lau currently serves as a Staff Software Engineer specializing in MLOps and Site Reliability Engineering (SRE) at Khan Academy.  His expertise in machine learning infrastructure and observability is critical for ensuring the performance and reliability of AI-driven products, such as Khanmigo.\r\n\r\nHe lives in Vancouver, Canada, and serves as an organizer for the local Vancouver PyData chapter.", "public_name": "Boris Lau", "guid": "67f0e434-3ca0-5d3c-9b98-a7afecf1ea4b", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/7XP8KU/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/YBZLZK/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/YBZLZK/", "attachments": []}, {"guid": "582a55ae-726b-52a1-8fcf-1fbeb9d22e3f", "code": "SFG8MV", "id": 79208, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/SFG8MV/443775241-b1fd8715-_9IxJogQ.png", "date": "2025-12-10T17:30:00+00:00", "start": "17:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-79208-build-your-own-personal-data-warehouse", "url": "https://cfp.pydata.org/pydataglobal2025/talk/SFG8MV/", "title": "Build your own Personal Data Warehouse", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Tired of paying for cloud compute just to view your own data? 
Discover how to build a completely free, open-source personal data warehouse that runs entirely on your machine.\r\n\u2013 Import data from Excel, CSV, SQL Server, and Microsoft Fabric\r\n\u2013 Use AI-powered Python/C# code for advanced data transformations\r\n\u2013 Generate SSRS-style reports \u2013 no cloud required\r\n\u2013 Leverage local compute power to avoid cloud costs", "description": "Typically, a data warehouse operates in the cloud. When you access your data\u2014even just to view it\u2014you incur compute costs (in other words, you pay). But you already have a computer with a CPU and memory capable of handling most tasks. Wouldn\u2019t it be great to view, edit, and transform your data right on your own machine? Now you can!\r\n\r\nThis free open source application allows you to import your data, transform it using AI to create python code to perform calculations, and report and export the results.\r\n\r\nIn this talk, Microsoft MVP Michael Washington shows how to:\r\n\u2013 Import data from Excel, CSV, SQL Server, and Microsoft Fabric\r\n\u2013 Use AI-powered Python/C# code for advanced data transformations\r\n\u2013 Generate SSRS-style reports \u2013 no cloud required\r\n\u2013 Leverage local compute power to avoid cloud costs\r\n\r\nWhether you\u2019re a developer, analyst, or data enthusiast, this session will help you take full control of your data with zero hosting fees. Live demos included!", "recording_license": "", "do_not_record": false, "persons": [{"code": "NLGAHA", "name": "Michael Alan Washington", "avatar": "https://cfp.pydata.org/media/avatars/NLGAHA_8QyROdW.webp", "biography": "Michael Washington is a Microsoft MVP and an ASP.NET C# Microsoft Blazor programmer. He has extensive knowledge in artificial intelligence, and student information systems. 
He is the founder of BlazorData.net.", "public_name": "Michael Alan Washington", "guid": "912e60b9-8ed8-546a-a615-31b9c9243576", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/NLGAHA/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/SFG8MV/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/SFG8MV/", "attachments": []}, {"guid": "f4459d59-7150-50c5-bc4d-68c6af0df8f8", "code": "ARAZTG", "id": 79565, "logo": null, "date": "2025-12-10T18:00:00+00:00", "start": "18:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-79565-llms-chatbots-and-dashboards-visualize-your-data-with-natural-language", "url": "https://cfp.pydata.org/pydataglobal2025/talk/ARAZTG/", "title": "LLMs, Chatbots, and Dashboards: Visualize Your Data with Natural Language", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "LLMs have a lot of hype around them these days. Let's demystify how they work and see how we can put them in context for data science use. As data scientists, we want to make sure our results are inspectable, reliable, reproducible, and replicable. We already have many tools to help us in this front. However, LLMs provide a new challenge; we may not always be given the same results back from a query. This means trying to work out areas where LLMs excel in, and use those behaviors in our data science artifacts. This talk will introduce you to LLMs, the Chatlas package, and how they can be integrated into a Shiny app to create an AI-powered dashboard. We'll see how we can leverage the tasks LLMs are good at to better our data science products.", "description": "This talk plans to provide data scientists the tools and techniques needed to integrate AI into their data products. 
Specifically around how to use APIs to work with chat providers and show where and how we can leverage tasks LLMs are good at to make sure we are confident with their output.\r\n\r\nTalk breakdown:\r\n\r\n0-5: introduction and where we can push LLMs\r\n5-10: Example of tasks where the LLM can do well, and where it can fail (in a data science context)\r\n10-15: brief introduction to the Chatlas package\r\n15-20: brief introduction on Shiny dashboards and integrating Chatlas into Shiny\r\n20-25: demo + example of putting everything together and how we can create an LLM-powered data science product.\r\n25-30: Q+A / overflow", "recording_license": "", "do_not_record": false, "persons": [{"code": "7BJQSY", "name": "Daniel Chen", "avatar": "https://cfp.pydata.org/media/avatars/7BJQSY_sLwbtih.webp", "biography": "I am a lecturer at The University of British Columbia and data science educator at Posit, PBC. I love teaching tools to empower data scientists.", "public_name": "Daniel Chen", "guid": "95e59883-6f50-50d0-865b-e52f6fd847b0", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/7BJQSY/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/ARAZTG/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/ARAZTG/", "attachments": []}, {"guid": "acf50111-a413-5f6f-a0a0-4f7ecfba27d4", "code": "8U7WLS", "id": 78742, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/8U7WLS/uqlm_graphic_NwE6pkZ.png", "date": "2025-12-10T18:30:00+00:00", "start": "18:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78742-uqlm-detecting-llm-hallucinations-with-uncertainty-quantification-in-python", "url": "https://cfp.pydata.org/pydataglobal2025/talk/8U7WLS/", "title": "UQLM: Detecting LLM Hallucinations with Uncertainty Quantification in Python", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "As LLMs become increasingly embedded in critical 
applications across healthcare, legal, and financial domains, their tendency to generate plausible-sounding but false information poses significant risks. This talk introduces UQLM, an open-source Python package for uncertainty-aware generation that flags likely hallucinations without requiring ground truth data. UQLM computes response-level confidence scores from token probabilities, consistency across sampled responses, LLM judges, and tunable ensembles. Attendees will learn practical strategies for implementing hallucination detection in production systems and leave with code examples they can immediately apply to improve the reliability of their LLM-powered applications. No prior uncertainty quantification background required.", "description": "### Objective.\r\nShow how to add uncertainty-aware controls to LLM apps using UQLM so practitioners can detect and handle hallucinations at generation time without ground truth data.\r\n\r\n### Context and Gap.\r\nMany hallucination detection methods assume existence of ground truth data, which is rarely available in production. Research has proposed ground-truth-free uncertainty quantification (UQ) techniques, but adoption suffers from fragmented tooling. 
UQLM packages these methods behind a simple API and provides a versatile suite of UQ-based confidence scorers that work across tasks.\r\n\r\n### What you will see.\r\n- Black-box UQ via response consistency from multiple samples\r\n- White-box UQ from token log probabilities\r\n- LLM-as-a-judge scoring\r\n- Ensemble tuning and threshold selection for your use case\r\n- Patterns for routing: block, warn, or escalate to human review\r\n\r\n### Outline (30 minutes total).\r\n- 0\u20134: Why hallucinations matter in production \r\n- 4\u20138: Limits of traditional hallucination detection approaches and where UQ fits\r\n- 8\u201320: UQLM walkthrough and code examples\r\n- 20\u201324: Choosing thresholds and tuning ensembles\r\n- 24\u201327: Results on several use cases and interpreting confidence\r\n- 27\u201330: Q&A\r\n\r\n### Expected background. \r\nBasic familiarity with LLMs and machine learning. No prior uncertainty quantification knowledge required.\r\n\r\n### Key takeaways.\r\n- When and why ground-truth-free hallucination detection is useful in production\r\n- How to add UQLM to a Python app in a few lines of code\r\n- Pros and cons of consistency-based, token-probability-based, and judge-based methods\r\n- Practical guidance on thresholds, ensemble tuning, and handling low-confidence outputs", "recording_license": "", "do_not_record": false, "persons": [{"code": "KGBSGD", "name": "Dylan Bouchard", "avatar": "https://cfp.pydata.org/media/avatars/KGBSGD_22aQQbY.webp", "biography": "Dylan Bouchard is a Principal Applied Scientist focusing on AI Research & Open Source at CVS Health. He leads the company's Responsible AI Research program, where he developed two impactful open source libraries: UQLM, a toolkit for detecting hallucinations in large language models, and LangFair, a framework for evaluating bias and fairness in LLMs. 
His work bridges academic research with practical tools that help make AI systems more reliable and equitable.", "public_name": "Dylan Bouchard", "guid": "d8490d98-81e1-553b-b666-7eb3ee3a4f5c", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/KGBSGD/"}, {"code": "ANPNZZ", "name": "Mohit Singh Chauhan", "avatar": "https://cfp.pydata.org/media/avatars/ANPNZZ_6JIHIGO.webp", "biography": "I am a Senior Data Scientist at CVS Health and work in Responsible AI and LLM/Agentic systems. My expertise lies in the technical aspects of ethical AI, with a particular focus on bias and fairness testing. I am dedicated to identifying and mitigating biases in AI systems to ensure they are fair and equitable for all users. Additionally, I specialize in hallucination detection and mitigation for large language models (LLMs), multi-modal models, and AI agents, striving to enhance the reliability and trustworthiness of these advanced technologies. Recent cutting-edge tools include open-source libraries like LangFair and UQLM.", "public_name": "Mohit Singh Chauhan", "guid": "60ac47bb-adff-58f7-8ed5-e3d25fba4de2", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/ANPNZZ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/8U7WLS/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/8U7WLS/", "attachments": []}], "Analytics, Visualization & Decision Science": [{"guid": "d5a4e46e-0abd-5a70-a377-7d5235d2cc24", "code": "ZS37FH", "id": 78538, "logo": null, "date": "2025-12-10T13:00:00+00:00", "start": "13:00", "duration": "00:30", "room": "Analytics, Visualization & Decision Science", "slug": "pydataglobal2025-78538-reviving-survival-analysis-timeless-yet-overlooked", "url": "https://cfp.pydata.org/pydataglobal2025/talk/ZS37FH/", "title": "Reviving Survival Analysis: Timeless, Yet Overlooked?", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Talk", "language": "en", "abstract": "Survival analysis 
tackles one of the oldest and most universal questions in data science: Can we learn from the past when something will happen in the future? I will introduce you to the core concepts of survival analysis, visualize time-to-event datasets with python and R, and introduce pertinent probability distributions. Classical analysis methods for fitting such datasets - some developed long before the age of modern computing - will be confronted to machine-learning approaches. Along the way, surprising paradoxes and counterintuitive results will reveal why survival analysis is not merely a blend of regression and classification, but an important prediction problem of its own.", "description": "Since at least 1693, when the first actuarial tables were used for calculating insurance premiums, survival (or \"time-to-event\") analysis has been relevant for many disciplines. Whether predicting when a mechanical component will fail, when a patient will recover, or when a customer will return a product, survival analysis has applications in nearly every domain - from engineering and medicine to finance and e-commerce. Despite its broad applicability and deep statistical foundations, survival analysis remains underappreciated in modern data science.\r\n\r\nI therefore want to give the audience, who does not need to have heard of survival analysis before, an impression about what survival analysis is about, what one needs to be careful with, and which analytical and computational tools to use to get to reliable predictions. In a step-by-step constructive approach, I will slowly guide the audience from the simplest flavor of the fully observed time-to-event-problem to the more intricate versions that include censoring and truncation, in which managing one's own ignorance becomes the most important and challenging aspect. Numerous code examples in python and R will make the talk hands-on, and allow listeners to replicate the numerical experiments and visualizations. 
At the same time, I will constantly recur to lucid everyday-examples (what age should the house that you buy have so you avoid problems? how long can you use your winter tires on your car? why is milk often still good after the best-before date?) - and thereby hopefully convince the audience: Survival analysis is almost always everywhere.\r\n\r\nOutline: \r\n\r\n- Motivation: The oldest problem in data science? [1 min]\r\n- Introduction: Prediction problems that are in fact survival problems? [3 min]\r\n- The simple case: Fully observed datasets. Visualization of the cumulative failure distribution. [3 min]\r\n- The Weibull distribution as the workhorse of survival analysis: How to model early failures, constant risks and wear-outs. [4 min]\r\n- Why reporting another case of illness can be good news. [2 min]\r\n- Censoring: What can we learn from not having observed anything yet? [2 min]\r\n- The Kaplan-Meier estimator and the maximum-likelihood principle. [5 min]\r\n- Machine Learning approaches to the survival problem. [3 min]\r\n- Outlook: Which degree of individualized survival forecasts can we expect in the future? [2 min]\r\n\r\nAfter the talk, the audience will be able to recognize the time-to-event problem in their own domain, and use the appropriate tools in python and R to analyze and model it.", "recording_license": "", "do_not_record": false, "persons": [{"code": "WXFBXT", "name": "Malte Tichy", "avatar": "https://cfp.pydata.org/media/avatars/WXFBXT_9EUqk9N.webp", "biography": "Malte Tichy has a research background in theoretical quantum physics, with a PhD from\r\nthe University of Freiburg. He learned the nuts and bolts of applied data science and forecasting within various hands-on and leadership roles at the supply chain software company Blue Yonder. 
As a Discipline Expert in Data Analytics & AI, he works on forecasts for wind-turbine component reliability and maintenance expenditures at Siemens Gamesa Renewable Energy.", "public_name": "Malte Tichy", "guid": "f4055832-e904-5529-bab8-d968db7fc2e0", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/WXFBXT/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/ZS37FH/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/ZS37FH/", "attachments": []}, {"guid": "c276c648-e1b1-5eec-8d9c-5ea0ce3d87dd", "code": "S7PC89", "id": 78336, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/S7PC89/monty-hall-goat-lap_tIcFNgr.png", "date": "2025-12-10T14:00:00+00:00", "start": "14:00", "duration": "00:30", "room": "Analytics, Visualization & Decision Science", "slug": "pydataglobal2025-78336-lessons-in-decision-making-from-the-monty-hall-problem", "url": "https://cfp.pydata.org/pydataglobal2025/talk/S7PC89/", "title": "\ud83d\udeaa\ud83d\udeaa\ud83d\udc10 Lessons in Decision Making from the Monty Hall Problem", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Talk", "language": "en", "abstract": "Switch or stay, what do you say? And more importantly, why?\r\n\r\nThe Monty Hall Problem is a well-known brain teaser from which we can learn important lessons in decision making that are useful in general and in particular for data scientists.\r\n\r\nIf you are not familiar with this problem, prepare to be perplexed \ud83e\udd2f. If you are, I hope to shine light on aspects that you might not have considered \ud83d\udca1.\r\n\r\nI introduce the problem and solve with three types of intuitions: Common, Bayesian and Causal. I summarise with a discussion on lessons learnt for better data decision making.", "description": "Imagine you're a contestant on a game show. Three doors stand before you: behind one is a prize car, behind the other two are goats. 
You choose a door, and the host\u2014who knows what's behind each\u2014reveals a goat behind one of the doors you didn\u2019t pick. Now you're asked: \"Do you want to switch your choice or stay?\"\r\n\r\nThis is the essence of the Monty Hall Problem, a classic puzzle that famously baffles our intuitions about probability. While it may seem like just a fun brain teaser, it offers profound lessons for decision-making under uncertainty.\r\n\r\nIn this talk, we'll break down the Monty Hall Problem, explore its counterintuitive nature, and uncover what it teaches us about probabilistic reasoning and critical thinking. Together, we'll navigate multiple perspectives.\r\n\r\nKey Topics:\r\n* The Monty Hall Problem: Origins, setup, and why it confuses even experts\r\n* Misconceptions and cognitive biases: Why our gut reactions often lead us astray\r\n* Bayesian thinking: The power of belief updating in uncertain scenarios\r\n* Information theory: How the host's actions reveal hidden information\r\n* Causal reasoning: A fresh lens for understanding the game's dynamics\r\n* Real-world takeaways: Applying these lessons to practical decision-making\r\n\r\nBy the end of this session, attendees will gain:\r\n\r\n* A clear understanding of the Monty Hall Problem and its solution\r\n* Insights into the pitfalls of intuitive probability judgments\r\n* Strategies for approaching complex decisions and probabilistic reasoning\r\n\r\nThis session is for data scientists, analysts, and decision-makers at all experience levels. 
No advanced math is required\u2014just curiosity and a willingness to rethink what you know about probability.\r\n\r\nJoin me to discover how a seemingly trivial game show puzzle can sharpen your decision-making skills and elevate your approach to statistics, data science, and beyond.\r\n\r\nI have summarised this talk in this publication: [bit.ly/mh-lessons](https://bit.ly/mh-lessons).", "recording_license": "", "do_not_record": false, "persons": [{"code": "NLRPSE", "name": "Eyal Kazin", "avatar": "https://cfp.pydata.org/media/avatars/NLRPSE_Rfg6kAq.webp", "biography": "I'm an Ex-cosmologist turned data scientist with 20 years experience in solving challenging problems. I am motivated by intellectual challenges, highly detail oriented and love visualising data results to communicate insights for better decisions within organisations.\r\n\r\nMy main drive is applying scientific approaches that result in practical and clear solutions. To accomplish these, I use whatever works, be it statistical/causal inference, machine/deep learning or optimisation algorithms. 
Being result driven I have a passion for facilitating stakeholders to make data driven decisions by quantifying and communicating the impact of interventions to non-specialist audiences in an accessible manner.\r\n\r\nIn my free time I craft engaging articles on applied stats in data science and machine learning: https://medium.com/@eyal-kazin\r\n\r\nMy claim to fame is that between 2004-2014 I lived on four different continents within a span of a decade, including three tennis Grand Slam cities (NYC, Melbourne, London).", "public_name": "Eyal Kazin", "guid": "5cebb4b6-1507-5b10-97e3-2ee38a6f53ef", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/NLRPSE/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/S7PC89/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/S7PC89/", "attachments": []}, {"guid": "459ca2a9-0fb1-5748-a606-180d563e98cf", "code": "J9JCL9", "id": 77955, "logo": null, "date": "2025-12-10T16:00:00+00:00", "start": "16:00", "duration": "00:30", "room": "Analytics, Visualization & Decision Science", "slug": "pydataglobal2025-77955-decisions-under-uncertainty-a-hands-on-guide-to-bayesian-decision-theory", "url": "https://cfp.pydata.org/pydataglobal2025/talk/J9JCL9/", "title": "Decisions Under Uncertainty: A Hands\u2011On Guide to Bayesian Decision Theory", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Talk", "language": "en", "abstract": "We often must make decisions under uncertainty\u2014should you carry an umbrella if there's a 30\u202f% chance of rain? 
Bayesian decision theory provides a principled, probabilistic framework to answer such questions by combining beliefs (probabilities), utilities (what matters to us), and actions to maximize expected gain.\r\n\r\nThis talk:\r\n- Introduces key decision\u2011theoretic concepts in intuitive terms.\r\n- Uses a toy umbrella example to ground ideas in relatable context.\r\n- Demonstrates applications in Bayesian optimization (PoI/EI) and Bayesian experimental design.\r\n- Is hands\u2011on\u2014with Python code and practical tools\u2014so participants leave ready to apply these ideas to real\u2011world problems.", "description": "This talk bridges everyday decision-making (umbrella example) with advanced techniques like Bayesian optimization and experimental design, and equips attendees with conceptual clarity and immediate code they can adapt to their data-driven workflows.\r\n\r\n## Audience\r\n\r\nPrimarily data scientists, ML practitioners, and statisticians who:\r\n\r\n- Have applied Bayesian models but want a broader decision-theory perspective.\r\n- Want actionable insight into uncertainty-aware decision frameworks.\r\n- Seek practical demos in Python.\r\n\r\n## Outline\r\n\r\n### Motivation & Core Concepts (5\u202fmin)\r\n\r\n- Frame real-world decision problems: rain or shine, clinical trials, A/B testing.\r\n- Introduce Bayesian decision theory: beliefs \u00d7 utilities \u2192 action via expected utility maximization.\r\n\r\n### Toy Example: Should I Bring an Umbrella? 
(8\u202fmin)\r\n\r\n- Define: Probability p of rain; utility/loss matrix\r\n\r\n| Action      | Rain         | No Rain            |\r\n| ----------- | ------------ | ------------------ |\r\n| Umbrella    | \u20131 (weight)  | \u20131 (inconvenience) |\r\n| No Umbrella | \u201310 (soaked) | 0                  |\r\n\r\n- Derive expected utility:\r\n```\r\nEU_umbrella = -1\r\nEU_no_umbrella = -10p\r\n```\r\n\r\nSo bring umbrella if p > 0.1.\r\n\r\n- Interactive Python demo: explore how p and utility values shift the decision point.\r\n\r\n### Bayesian Optimization: PoI & EI (8 min)\r\n\r\n- Introduce Gaussian-process-based optimization and the need to trade off exploration vs. exploitation.\r\n- Define Probability of Improvement (PoI) and Expected Improvement (EI)\r\n- Show how they're derived from decision theory: choosing the next point to maximize expected gain.\r\n- Python demo using GPyTorch: fit GP, compute PoI/EI acquisition functions, visualize decision boundary\u2014why one chooses a high-uncertainty point vs. 
one near known good values.\r\n\r\n### Bayesian Experimental Design (BED): Minimizing Uncertainty (8 min)\r\n\r\n- Motivation: cost-sensitive data collection (labeling, surveys, medical tests).\r\n- Define an information-based utility (e.g., expected reduction in entropy).\r\n- Show how decision theory prescribes choosing the next experiment to maximize this expected utility.\r\n- Python demo using OptBayesExpt.\r\n\r\n\r\n### Summary & Takeaways (1 min)\r\n\r\n- Reiterate the decision-theoretic arc: belief \u2192 utility \u2192 action.\r\n- Emphasize the unifying framework across umbrella example, optimization, and experimental design.\r\n- Share resources & practical tips: GPyTorch / scikit-optimize, OptBayesExpt", "recording_license": "", "do_not_record": false, "persons": [{"code": "79DLSQ", "name": "Quan Nguyen", "avatar": "https://cfp.pydata.org/media/avatars/79DLSQ_3UpOLvK.webp", "biography": null, "public_name": "Quan Nguyen", "guid": "51c373bc-168e-5337-b433-79d7f321a7b3", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/79DLSQ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/J9JCL9/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/J9JCL9/", "attachments": []}, {"guid": "58f7a4d2-9479-5afb-ae55-e61dbdb29077", "code": "8RUFNS", "id": 78560, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/8RUFNS/fastplotlib_SSa-R02_RTgO3OW.png", "date": "2025-12-10T17:00:00+00:00", "start": "17:00", "duration": "00:30", "room": "Analytics, Visualization & Decision Science", "slug": "pydataglobal2025-78560-fastplotlib-driving-scientific-discovery-through-data-visualization", "url": "https://cfp.pydata.org/pydataglobal2025/talk/8RUFNS/", "title": "fastplotlib: driving scientific discovery through data visualization", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Talk", "language": "en", "abstract": "Fast interactive visualization remains a considerable barrier in 
analyses pipelines for large neuronal datasets. Here, we present fastplotlib, a scientific plotting library featuring an expressive API for very fast visualization of scientific data. Fastplotlib is built upon pygfx which utilizes the GPU via WGPU, allowing it to interface with modern graphics APIs such as Vulkan for fast rendering of objects. Fastplotlib is non-blocking, allowing for interactivity with data after plot generation. Ultimately, fastplotlib is a general purpose scientific plotting library that is useful for the fast and live visualization and analysis of complex datasets.", "description": "Over the past decade, advanced analyses pipelines have been developed for the analysis of large datasets. However, fast visualization and live interactivity during data collection remains challenging. While current tools within the Python plotting ecosystem allow for interactive data visualization, they either fail to leverage modern GPUs efficiently, lack intuitive APIs for rapid prototyping, or require users to write their own shaders. Additionally, other popular plotting libraries, such as bokeh and matplotlib, are not geared towards fast interactive visualization with millions of objects. Given these challenges with current visualization tools, the need for a modern GPU-driven interactive plotting library exists. In this presentation, we will go through the technical details, as well as a brief demo on how fastplotlib makes fast interactive visualization of complex datasets possible. We will demonstrate the broad applicability of fastplotlib as a fast, general-purpose plotting library.\r\nFastplotlib is built on top of pygfx which is a cutting edge Python rendering engine that utilizes WGPU, which can efficiently leverage modern GPU and CPU hardware. WGPU is the successor to OpenGL and features a low overhead with respect to the amount of code per-draw-per-object allowing for speed even when rendering millions of objects. 
Pygfx is also non-blocking, which allows for interactivity and modification of already drawn objects. Fastplotlib utilizes the pygfx rendering library for fast visualization with an expressive API for scientific visualization. The benefits of fastplotlib are that it reduces boilerplate code which allows users to focus on their data without having to manage the underlying rendering process. Additionally, fastplotlib allows for animations as well as high-level interactivity among plots, which can be combined with lazy loading and lazy compute of very large datasets that are hundreds of gigabytes or terabytes in size. Furthermore, fastplotlib can be used in jupyter notebooks, allowing it to be used on cloud computing and other remote infrastructures for streaming visualizations of extremely large datasets. In total, these unique features and the underlying architecture create a plotting library that is fast, easy to use, and multifaceted.", "recording_license": "", "do_not_record": false, "persons": [{"code": "RMCH38", "name": "Kushal Kolar", "avatar": "https://cfp.pydata.org/media/avatars/RMCH38_vzXri6Z.webp", "biography": "PhD Candidate at NYU. 10+ years of experience using Python for data analysis and machine learning with neuroscience datasets. 
Core developer of fastplotlib and maintainer of several Python libraries in neuroscience with significant user bases, and a contributor to other libraries such as tslearn and CaImAn.", "public_name": "Kushal Kolar", "guid": "e2ed84c2-a945-56c6-b6a5-9b3f881e9edd", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/RMCH38/"}, {"code": "DSW7Y8", "name": "Caitlin Lewis", "avatar": "https://cfp.pydata.org/media/avatars/DSW7Y8_Vz9X6LA.webp", "biography": null, "public_name": "Caitlin Lewis", "guid": "ae7f5f53-235b-5a47-834b-83ead7b9e368", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/DSW7Y8/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/8RUFNS/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/8RUFNS/", "attachments": []}, {"guid": "da1bbbf8-114a-525f-ba1a-348f2958568e", "code": "NJNHQB", "id": 78434, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/NJNHQB/Screenshot_2025-07_dXbnZjS.webp", "date": "2025-12-10T17:30:00+00:00", "start": "17:30", "duration": "01:30", "room": "Analytics, Visualization & Decision Science", "slug": "pydataglobal2025-78434-bayesian-decision-analysis-with-pymc-beyond-a-b-testing", "url": "https://cfp.pydata.org/pydataglobal2025/talk/NJNHQB/", "title": "Bayesian Decision Analysis with PyMC: Beyond A/B Testing", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Tutorial", "language": "en", "abstract": "This hands-on tutorial introduces practical Bayesian inference using PyMC, focusing on A/B testing, decision-making under uncertainty, and hierarchical modeling. With real-world examples, you'll learn how to build and interpret Bayesian models, evaluate competing hypotheses, and implement adaptive strategies like Thompson sampling. 
Whether you're working in marketing, healthcare, public policy, UX design, or data science more broadly, these techniques offer powerful tools for experimentation, decision-making, and evidence-based analysis.", "description": "Bayesian methods offer a natural and interpretable framework for updating beliefs with data, and PyMC makes it easy to apply these techniques in practice. In this tutorial, we\u2019ll walk through a series of examples that demonstrate the core concepts:\r\n\r\n1. Bayesian A/B Testing with the Beta-Binomial Model\r\n\r\n  * Represent prior beliefs with the beta distribution  \r\n  * Use binomial likelihoods to model observed outcomes\r\n  * Understand posterior distributions and credible intervals\r\n\r\n2. Bayesian Bandits and Thompson Sampling\r\n\r\n  * Go beyond hypothesis testing: estimate the probability of one version outperforming another\r\n  * Use Thompson sampling to guide decision-making\r\n  * Simulate and visualize an adaptive email campaign\r\n\r\n3. Hierarchical Models for Partial Pooling and Prediction\r\n\r\n  * Learn how to share information across variants\r\n  * Use posterior predictive distributions to quantify uncertainty\r\n  * Understand second-order probabilities\r\n\r\nHands-On Learning\r\n\r\nParticipants will follow along in Jupyter notebooks (hosted on Colab \u2014 no installation required). Exercises are embedded throughout, with guided solutions. 
Code is based on PyMC, ArviZ, and standard scientific Python libraries.\r\n\r\nPrerequisites\r\n\r\n  * Intermediate Python: basic familiarity with NumPy, plotting, and Jupyter notebooks\r\n  * No prior experience with Bayesian statistics or PyMC is assumed\r\n  * All materials run on Colab (no setup required)", "recording_license": "", "do_not_record": false, "persons": [{"code": "CYQ9UX", "name": "Allen Downey", "avatar": "https://cfp.pydata.org/media/avatars/CYQ9UX_c95J2pT.webp", "biography": "Allen Downey is a principal data scientist at PyMC Labs and professor emeritus at Olin College. He is the author of several books including Think Python, Think Bayes, and Probably Overthinking It -- and a blog about programming and data science. He received a Ph.D. in computer science from the University of California, Berkeley, and Bachelor's and Masters degrees from MIT.", "public_name": "Allen Downey", "guid": "2d9e8c6d-b9d8-5358-9ce8-f1336f142f44", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/CYQ9UX/"}], "links": [{"title": "The material for the tutorial is in this repository", "url": "https://github.com/AllenDowney/BDAwithPyMC", "type": "related"}], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/NJNHQB/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/NJNHQB/", "attachments": []}], "Data Engineering & Infrastructure": [{"guid": "fe2bdbfb-e8c5-5fc7-8122-95a951da0726", "code": "EXUXFR", "id": 78639, "logo": null, "date": "2025-12-10T12:00:00+00:00", "start": "12:00", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-78639-getting-big-openstreetmap-data-with-quackosm", "url": "https://cfp.pydata.org/pydataglobal2025/talk/EXUXFR/", "title": "Getting big OpenStreetMap data with QuackOSM", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "[OpenStreetMap](https://www.openstreetmap.org/) data is publicly available, but it's hard 
to get it downloaded at scale without domain knowledge and an external technology stack.\r\n\r\nWith [QuackOSM](https://github.com/kraina-ai/quackosm), you can easily work with whole-country vector and tag data without installing additional dependencies - come and find out how you can use it in your next project!", "description": "[QuackOSM](https://github.com/kraina-ai/quackosm) is a powerful and user-friendly library that streamlines the process of accessing and manipulating OpenStreetMap (OSM) vector and tags data. It's using the [DuckDB](http://duckdb.org/) engine with its [Spatial extension](https://duckdb.org/docs/extensions/spatial/overview), and PyArrow library that enables users to efficiently retrieve large-scale OSM data in the GeoParquet format.\r\n\r\nIt's similar in functionality to other available libraries, but it's faster, can work with bigger than memory datasets and doesn't require any additional dependencies.\r\n\r\n---\r\n\r\nTarget audience:\r\nData engineers/analysts/scientists who have worked with or want to work with geospatial data.\r\n\r\n---\r\n\r\nOutline:\r\n- Brief OpenStreetMap data introduction\r\n- Introduction to DuckDB and PyArrow\r\n- Why is it hard to work with big OSM datasets? Introduction to the OpenStreetMap data schema and PBF format.\r\n- QuackOSM overview: basic usage, data filtering, example use-cases + benchmark against available libraries (OSMnx, Pyrosm, PyDriosm and others).\r\n- Example of a simple ML model built on top of geospatial data", "recording_license": "", "do_not_record": false, "persons": [{"code": "AEKH3Q", "name": "Kamil Raczycki", "avatar": "https://cfp.pydata.org/media/avatars/AEKH3Q_k2VTnhw.webp", "biography": "Geospatial Data Scientist with a drive to contribute to the open-source space. 
Co-developer of SRAI library and maintainer of QuackOSM and OvertureMaestro libraries.\r\nInterested in exploring how machine learning models with geospatial data can improve our lives.", "public_name": "Kamil Raczycki", "guid": "cdbe27e7-54e5-51f8-9555-abcc57e7acc7", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/AEKH3Q/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/EXUXFR/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/EXUXFR/", "attachments": [{"title": "Slides", "url": "/media/pydataglobal2025/submissions/EXUXFR/resources/PyData_Gl_iFQF8r4.pdf", "type": "related"}]}, {"guid": "f2d6c345-257b-543c-bae3-589c12adaae3", "code": "YTYRLZ", "id": 78691, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/YTYRLZ/rdepot_pydata_2025_w6QusCB.png", "date": "2025-12-10T13:00:00+00:00", "start": "13:00", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-78691-rdepot-100-open-source-enterprise-management-of-python-and-r-repositories", "url": "https://cfp.pydata.org/pydataglobal2025/talk/YTYRLZ/", "title": "RDepot - 100% open source enterprise management of Python and R repositories", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "[RDepot](https://rdepot.io) is a solution for the management of R package repositories in an enterprise environment. Python support has recently been implemented and this talk will introduce RDepot to the Python community. It allows to submit packages through a user interface or API and to automatically update and publish Python and R repositories. 
In this talk we will walk Python users and developers through different features of RDepot and demonstrate how these can be useful in different scenarios.", "description": "[RDepot](https://rdepot.io) is a solution for the management of Python and R package repositories in an enterprise environment.\r\nIt allows to submit packages through a user interface or API and to automatically update and publish Python and R repositories.\r\nMultiple departments can manage their own repositories and different users can have different roles in the management of their packages.\r\nWith continuous integration infrastructure for quality assurance on Python and R packages, package uploads can be automated.\r\nAll configuration is declarative and RDepot can be set up as infrastructure as code, which is especially relevant in regulated contexts, since it makes validation activities much easier.\r\nPackages from publicly available Python repositories such as [PyPi](https://pypi.org/) can be mirrored selectively in custom repositories for use behind a firewall, in internal networks and offline.\r\nCombined with [Crane](https://craneserver.net), authentication and fine-grained authorization (using [OpenID Connect](https://openid.net/developers/how-connect-works/)) can be configured per repository, which offers extra security when dealing with sensitive data or sensitive methodology.\r\n\r\nIn this talk we will walk Python users and developers through different features of RDepot and demonstrate how these can be useful in different scenarios.\r\nThe logic of the different workflows will be explained and live demos will be given to see the open source solution in action.\r\nWe will make sure to address needs ranging from small research groups sharing a handful of packages up to multinational companies managing their Python (and R) code across the globe.", "recording_license": "", "do_not_record": false, "persons": [{"code": "CWWVRN", "name": "Jonas Van Malder", "avatar": null, 
"biography": null, "public_name": "Jonas Van Malder", "guid": "a3693f92-dc8a-5784-adc2-0c00bf6fe52e", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/CWWVRN/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/YTYRLZ/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/YTYRLZ/", "attachments": []}, {"guid": "56128b70-5b28-5f2a-bf60-7e33b9135c3c", "code": "CCRL7W", "id": 79175, "logo": null, "date": "2025-12-10T13:30:00+00:00", "start": "13:30", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-79175-modernizing-json-for-julia", "url": "https://cfp.pydata.org/pydataglobal2025/talk/CCRL7W/", "title": "Modernizing JSON for Julia", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "JSON support and interfaces vary widely across languages and Julia has been no different. As Julia has evolved as a language, patterns and best practices with regards to interfaces have also evolved with how to best leverage Julia's unique strengths: multiple dispatch, library composability, and zero-cost abstraction. The original JSON.jl package has been rewritten from scratch for a (finally!) 1.0 release bringing JSON support in Julia up to modern best practices and patterns, combining functionality from at least 3(!) 
existing JSON packages into one unified library offering.", "description": "Over Julia's history, there have been a number of JSON packages providing various forms of JSON support:\r\n* JSON.jl: oldest/original JSON package; very simple JSON support for reading/writing for mostly just core Julia data structures\r\n* LazyJSON.jl: package that attempted to provide \"lazy\" parsing support where JSON could be scanned without fully materializing objects in memory; never quite fully \"finished\" the package/functionality and was thus, never really widely adopted\r\n* JSON2.jl/JSON3.jl: Iterations on interfaces to support custom struct serialization/deserialization in Julia\r\n\r\nThe new 1.0 release to the JSON package combines the functionality from all these packages in a single, unified, *and modern* interface. Package functionality now includes:\r\n* Same basic JSON support of reading/writing for core datastructures\r\n* Support for lazily processing JSON including extracting deeply nested values without intermediate materialization\r\n* A new JSON.Object structure that mimics a `Dict{Symbol, Any}` but preserves insertion (or in this case parse) order, allows dot access, and in most cases is faster with fewer memory allocations than Dict.\r\n* Custom struct serialization/deserialization support that includes specifying field defaults, custom field lower/lift functionality, or directly mutating fields (of mutable structs) while parsing\r\n\r\nThis talk aims to cover the historical context leading to the JSON.jl 1.0 release, how the package leverages clean internal interfaces to provide a ton of functionality without exploding the codebase, and why the decision was made to ultimately rewrite the original JSON.jl package for a 1.0 release instead of yet-another-JSONX.jl type package.", "recording_license": "", "do_not_record": false, "persons": [{"code": "MQ3SJP", "name": "Jacob Quinn", "avatar": "https://cfp.pydata.org/media/avatars/MQ3SJP_Z0yuEym.webp", "biography": 
"A core contributor to both the Julia language and package ecosystem for over a decade.", "public_name": "Jacob Quinn", "guid": "79bb8667-628f-5d5b-bb2c-aeddb458d581", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/MQ3SJP/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/CCRL7W/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/CCRL7W/", "attachments": []}, {"guid": "f8740a9f-2cb1-5fab-aeb1-79e341e592f4", "code": "ZXVYCB", "id": 78689, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/ZXVYCB/Swiss_Army_Knife_of_MBfXDS5.png", "date": "2025-12-10T14:00:00+00:00", "start": "14:00", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-78689-from-ideas-to-apis-delivering-fast-with-modern-python", "url": "https://cfp.pydata.org/pydataglobal2025/talk/ZXVYCB/", "title": "From Ideas to APIs: Delivering Fast with Modern Python", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "The modern Python ecosystem shortens the distance between idea and implementation. This talk presents a focused workflow to move from a business question to a working prototype, fast. We'll explore reproducible environments (uv, Docker), quick data iteration with polars and duckdb, clean project scaffolding (pyproject.toml), and lightweight service layers with FastAPI and pydantic. Along the way, we\u2019ll integrate tests (pytest), static checks (mypy), and fast linting (ruff). 
You\u2019ll leave with a reusable structure, toolchain recommendations, and a mental model for optimizing feedback loops and development in modern Python projects.", "description": "This talk outlines a practical, opinionated workflow for building real things quickly using modern Python without relying on heavy frameworks or over-engineering.\r\n\r\nCore idea: \r\n\r\nThe shortest path from notebook to usable component is a repeatable, well-lit toolchain with the right structure.\r\n\r\nAttendees will learn how to:\r\n\r\n1. Scaffold a clean project using pyproject.toml, deterministic environments (uv), and lightweight automation (e.g. Makefile or CLI scripts).\r\n\r\n2. Explore data rapidly with polars and duckdb, capturing the business logic in small, testable functions.\r\n\r\n3. Wrap the logic in a minimal FastAPI app with pydantic validation, creating clean contracts and boundaries.\r\n\r\n4. Add fast feedback mechanisms: tests with pytest, type safety via mypy, and low-friction code hygiene using ruff and pre-commit.\r\n\r\n5. Package a handoff-friendly interface (command-line entrypoints, minimal docs) for teammates or deployment pipelines.\r\n\r\nThis talk isn\u2019t a showcase of cutting-edge libraries. 
It\u2019s a field guide on how to leverage modern Python tools and foster repeatable software engineering habits to maximize value delivery.\r\n\r\nYou\u2019ll leave with:\r\n\r\n- A blueprint for rapid iteration.\r\n\r\n- Reusable patterns for API-bound prototyping.\r\n\r\n- A mindset that treats reproducibility as a first-class concern.\r\n\r\nPrior knowledge expected:\r\n\r\nBasic Python (functions, environments), familiarity with DataFrame operations, and HTTP/JSON fundamentals.", "recording_license": "", "do_not_record": false, "persons": [{"code": "APLPNP", "name": "C\u00e9sar Soto Valero", "avatar": "https://cfp.pydata.org/media/avatars/APLPNP_fOgnjY2.webp", "biography": "C\u00e9sar is currently a Data Scientist at SEB Group, where he develops AI models to enhance the security of financial transactions on a global scale. He completed an M.Sc. in Machine Learning and moved to Sweden in 2018 to pursue a Ph.D. in Computer Science at KTH Royal Institute of Technology. During his five years at KTH, he pioneered open-source tools and techniques to mitigate software bloat, contributing to the efficiency and security of modern software systems. C\u00e9sar is deeply passionate about AI, science, and technology, with a strong focus on bridging cutting-edge research with real-world applications. 
He is dedicated to advancing AI\u2019s role in building smarter, more resilient systems that drive innovation.", "public_name": "C\u00e9sar Soto Valero", "guid": "e05271b4-d3ac-54f0-a38d-a3126e797cdf", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/APLPNP/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/ZXVYCB/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/ZXVYCB/", "attachments": []}, {"guid": "af502a9e-2750-5a03-be7c-0f01dc9b92db", "code": "K38JGZ", "id": 79503, "logo": null, "date": "2025-12-10T14:30:00+00:00", "start": "14:30", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-79503-quiet-on-set-building-an-on-air-sign-with-open-source-technologies", "url": "https://cfp.pydata.org/pydataglobal2025/talk/K38JGZ/", "title": "Quiet on Set: Building an On-Air Sign with Open Source Technologies", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "While many of us have adapted to work from home life, one major problem remains: finding an easy way to keep folks in your home away from your workspace when you\u2019re on an important call. Dust off your Raspberry Pi\u2013\u2013let\u2019s build a custom on-air sign with Apache Kafka\u00ae, Apache Flink\u00ae, and Apache Iceberg\u2122!\r\n\r\nWe\u2019ll begin by writing Python scripts to capture key events\u2013\u2013such as when a Zoom meeting is running and when a camera is being used\u2013\u2013and produce it into Kafka. The live data are then consumed by a Raspberry Pi script to drive the operation of a custom designed on-air sign. From there, you\u2019ll be introduced to the ins and outs of FlinkSQL for stream processing as we wrangle the data into a better format for downstream use. 
And, finally, we\u2019ll see Iceberg in action and learn how to use query engines to analyze meeting and recording trends.\r\n\r\nBy the end of the session, you\u2019ll be well-acquainted with this powerful trio of open source technologies and know how you could use the same scaffolding and scale out a simple, at-home project to millions of users and simultaneous events.", "description": "Learn how to build a custom On-Air sign using Apache Kafka\u00ae, Apache Flink\u00ae, and Apache Iceberg\u2122! See how to capture events like Zoom meetings and camera usage with Python, process data with FlinkSQL, analyze trends in your Iceberg tables, and bring it all together with a practical IoT project that easily scales out.", "recording_license": "", "do_not_record": false, "persons": [{"code": "BQKRWQ", "name": "Danica Fine", "avatar": "https://cfp.pydata.org/media/avatars/BQKRWQ_KJfRbhF.webp", "biography": "Danica began her career as a software engineer in financial services and pivoted to developer relations, where she focussed primarily on open source technologies under the Apache Software Foundation umbrella such as Apache Kafka and Apache Flink. She now leads the open source advocacy efforts at Snowflake, supporting Apache Iceberg and Apache Polaris (incubating). 
She can be found on X (Bluesky and Mastodon), talking about tech, plants, and baking @TheDanicaFine.", "public_name": "Danica Fine", "guid": "6d1075a1-ba4b-5270-abc2-38f3f2b1b25d", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/BQKRWQ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/K38JGZ/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/K38JGZ/", "attachments": []}], "Impact Scholarship Program": [{"guid": "a3309cca-3fc9-548c-9c95-1cad2f113190", "code": "BGK8N8", "id": 86182, "logo": null, "date": "2025-12-10T16:00:00+00:00", "start": "16:00", "duration": "01:00", "room": "Impact Scholarship Program", "slug": "pydataglobal2025-86182-bof-from-data-to-decisions-leveraging-generative-ai-across-the-data-science-workflow", "url": "https://cfp.pydata.org/pydataglobal2025/talk/BGK8N8/", "title": "[BoF] From Data to Decisions: Leveraging Generative AI Across the Data Science Workflow", "subtitle": "", "track": "Impact Scholarship Program", "type": "Talk", "language": "en", "abstract": "Hosted by Inessa Pawson (NumPy Steering Council, OpenTeams Open Source Program)", "description": "This Birds of a Feather session provides an opportunity for a cross-disciplinary dialogue about practical applications, challenges, ethical considerations, and emerging best practices for leveraging generative AI in data science.", "recording_license": "", "do_not_record": false, "persons": [{"code": "8ZMAFB", "name": "Inessa Pawson", "avatar": "https://cfp.pydata.org/media/avatars/8ZMAFB_BDe72nN.webp", "biography": null, "public_name": "Inessa Pawson", "guid": "60246e0d-6a98-5ebf-a39a-16f734737132", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/8ZMAFB/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/BGK8N8/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/BGK8N8/", "attachments": []}]}}, {"index": 3, "date": "2025-12-11", "day_start": "2025-12-11T04:00:00+00:00", 
"day_end": "2025-12-12T03:59:00+00:00", "rooms": {"General Track": [{"guid": "a913c703-6fb2-5dff-ad91-10706e31c00b", "code": "NSWVT3", "id": 78785, "logo": null, "date": "2025-12-11T13:00:00+00:00", "start": "13:00", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-78785-when-the-meter-maxes-out-chernobyl-disaster-lessons-for-ml-systems-in-production", "url": "https://cfp.pydata.org/pydataglobal2025/talk/NSWVT3/", "title": "When the Meter Maxes Out: Chernobyl Disaster Lessons for ML Systems in Production", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "At 1:23 a.m. on 26 April 1986, the RBMK-4 graphite-moderated reactor at Chernobyl exploded. Every dosimeter still working inside flat-lined at 3.6 R/h, its maximum reading, while lethal radiation raged unseen. That single detail from Chernobyl is the perfect allegory for what can go wrong in modern machine-learning pipelines: clipped features, hidden distribution shifts, missing logs, runaway feedback loops, and more. This talk unpacks key incidents from the disaster and map each one to an equivalent failure mode in production ML, showing how silent risk creeps into data systems and how to engineer for resilience. Attendees will leave with a practical set of questions to ask, signals to track, and cultural habits that keep models (and the businesses that rely on them) well clear of their own meltdowns. No nuclear physics required.", "description": "Software engineers aren\u2019t nuclear engineers, yet the patterns behind catastrophic failure are uncannily transferable. In Chernobyl\u2019s control room, a radiation gauge pinned at 3.6 R/h masked lethal reality; in production we truncate floats, or hide exploding metrics behind poorly chosen histogram bins. Operators overrode the reactor\u2019s emergency cooling \u201cjust for this test\u201d; we disable schema validation to hurry a back-fill. 
Steam-void reactivity formed a positive feedback loop; recommenders amplify popularity bias until user engagement collapses.\r\n\r\nThe session walks through several such parallels. Each mini-segment starts with the historical context, then immediately pivots into a modern use-case that demonstrates the ML analogue, for instance, an ad-ranking model whose session_depth feature is computed differently online than in training, yielding a negative CTR lift despite glowing offline metrics.\r\nWhile the historical narrative keeps the material memorable, the engineering focus stays firmly on actionable prevention: tools like great expectations, out-of-distribution gates, reproducible datasets, and perhaps most importantly - a culture that treats \u201cimpossible\u201d as a probability, not a certainty.\r\n\r\nNo specialized nuclear knowledge is assumed. Code examples (when present) use familiar PyData stack - NumPy, Pandas, scikit-learn. The use-cases, concepts and tools shown can appeal to both seasoned practitioners and those earlier in their ML journey.", "recording_license": "", "do_not_record": false, "persons": [{"code": "JJJHHZ", "name": "Idan Richman Goshen", "avatar": "https://cfp.pydata.org/media/avatars/JJJHHZ_etK7USw.webp", "biography": "Idan Richman Goshen is a data-driven technologist with an M.A. in Economics and more than a decade of experience turning raw data into business impact. 
Before leading the Data Science team at Lusha, he built production-grade machine-learning systems at Localize and Dell.", "public_name": "Idan Richman Goshen", "guid": "ef95ffcc-3834-5a63-8e4a-edd895ccd6e2", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/JJJHHZ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/NSWVT3/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/NSWVT3/", "attachments": []}, {"guid": "051f8aa5-1ecd-5957-b37e-757019ef77bf", "code": "PSNG8L", "id": 79453, "logo": null, "date": "2025-12-11T14:00:00+00:00", "start": "14:00", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-79453-gpu-accelerated-zarr", "url": "https://cfp.pydata.org/pydataglobal2025/talk/PSNG8L/", "title": "GPU Accelerated Zarr", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "The zarr-python 3.0 release includes native support for device buffers, enabling Zarr workloads to run on compute accelerators like NVIDIA GPUs. This enables you to get more work done faster.\r\n\r\nThis talk is primarily intended for people who are at least somewhat familiar with Zarr and are curious about accelerating their n-dimensional array workload with GPUs. That said, we will start with a brief introduction to Zarr and why you might want to consider it as a storage format for the n-dimensional arrays (commonly seen in geospatial, microscopy, or genomics domains, among others). We'll see what factors affect performance and how to maximize throughput for your data analysis or deep learning pipeline. 
Finally, we'll preview the future improvements to GPU-accelerated Zarr and the packages building on top of it, like xarray and cubed.\r\n\r\nAfter attending this talk, you'll have the knowledge needed to determine if using zarr-python's support for device buffers can help accelerate your workload.", "description": "This talk is targeted at users who have at least heard of zarr, but we will give a brief introduction of the basics. The primary purpose is to spread knowledge about zarr-python\u2019s recently added support for device (GPU) buffers and arrays, and how it can be used to speed up your array-based workload.\r\n\r\nAn outline:\r\n\r\n- Introduction\r\n\r\n  - Brief overview of zarr (cloud-native format for storing chunked, n-dimensional arrays)\r\n  - Brief example of how easy it is to use zarr-python\u2019s native support for device arrays\r\n\r\n- Overview of GPU-accelerated Zarr workloads\r\n\r\n  - We\u2019ll see some high-level examples of how Zarr fits into larger workloads (e.g. analyzing climate simulations, as part of a deep learning pipeline)\r\n  - We\u2019ll discuss the key factors to think about when trying to maximize performance\r\n\r\n- Overview of how it works\r\n  - Show zarr\u2019s configuration options for selecting between host and device buffers\r\n  - An overview of the Zarr codec pipeline\r\n  - Show how on-device decompression can be used, to accelerate decompression if that\u2019s a bottleneck in your workload\r\n\r\n- Benchmarks showing the speedup users can expect to see from GPU acceleration\r\n\r\n- Preview of future work\r\n  - Zarr-python currently only uses a single GPU, and doesn\u2019t use any features like CUDA Streams. 
https://github.com/zarr-developers/zarr-python/issues/3271 tracks possible improvements for exposing additional parallelism.\r\n  - We\u2019ll look at a prototype of how CUDA streams enable asynchronous host-to-device memory copies, enabling you to start computing on one chunk of data while another chunk is being copied to the device.", "recording_license": "", "do_not_record": false, "persons": [{"code": "WWDGWB", "name": "Tom Augspurger", "avatar": "https://cfp.pydata.org/media/avatars/WWDGWB_Le8UvIg.webp", "biography": "I'm a software engineer at NVIDIA working on GPU-accelerated ETL tools as part of the RAPIDS team. I've helped maintain several libraries in the scientific python and geospatial stacks.", "public_name": "Tom Augspurger", "guid": "6163446f-aa03-5314-a32c-ce5eec8511db", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/WWDGWB/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/PSNG8L/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/PSNG8L/", "attachments": []}, {"guid": "68cbe6d8-cd4f-55bc-9868-c599f91ef7fc", "code": "FLD9SR", "id": 85077, "logo": null, "date": "2025-12-11T15:00:00+00:00", "start": "15:00", "duration": "01:00", "room": "General Track", "slug": "pydataglobal2025-85077-keynote-noor-aftab-the-next-commit-building-inclusive-data-driven-ecosystems-for-responsible-ai", "url": "https://cfp.pydata.org/pydataglobal2025/talk/FLD9SR/", "title": "Keynote- Noor Aftab- The Next Commit: Building Inclusive, Data-Driven Ecosystems for Responsible AI", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "Python is the number one language on GitHub, yet a git log of our shared future reveals a critical system failure. While Python adoption is exponential, women author only 2\u20133% of core-repository commits and comprise just 22% of the global AI talent pool. 
This talk moves beyond rhetoric to present a 5-Step Engineering Framework based on quantitative research from 24 global tech communities. We will introduce the VIM Model (Visibility, Invitation, Mechanism)\u2014a proven architecture that drove 179% membership growth and 99% retention in pilot programs.", "description": "Python powers the global AI ecosystem, yet 78% of the talent pool is missing.\r\n\r\nWe are building the most advanced systems in history with a critical \"innovation debt.\" This is evidenced not just by the gender gap, but by biased algorithms and higher error rates in production models. This talk treats this gap as an engineering crisis and provides a research-backed solution.\r\n\r\nDrawing on published work from the SciPy Proceedings and a quantitative study of 24 global tech communities, we will introduce the 5-Step Engineering Framework. We will deconstruct the VIM Model (Visibility, Invitation, Mechanism), which drove 179% membership growth in the IBM Women in AI pilot.\r\n\r\nAttendees will walk away with three actionable tools:\r\n\r\n1) The System Audit: A method to measure \"innovation debt\" in your own teams using specific retention and contribution metrics.\r\n2) The VIM Patch: A blueprint for deploying high-yield mechanisms\u2014such as hands-on Python labs (requested by 76% of members)\u2014that statistically outperform generic networking.\r\n3) The Retention Fix: A step-by-step guide to stabilizing the \"leaky pipeline,\" specifically addressing the mid-career drop-off point where 50% of diverse talent currently leaves.\r\n\r\nThis session is for builders and maintainers ready to stop admiring the problem and commit to the fix.", "recording_license": "", "do_not_record": false, "persons": [{"code": "HFCNVU", "name": "Noor Aftab", "avatar": "https://cfp.pydata.org/media/avatars/HFCNVU_ASun1rk.webp", "biography": "Noor Aftab is the Global Program Lead at Amazon Web Services (AWS), where she drives strategic programs for Amazon S3, 
supporting some of the world\u2019s most complex data, AI, and analytics workloads. With a foundation in software engineering and data science, she brings over a decade of experience building and scaling cloud-native solutions, AI/ML systems, and developer-focused programs.\r\n\r\nShe serves as Vice President of the Society of Women Engineers (SWE) Pacific Northwest section, championing technical leadership and mentoring initiatives across engineering communities. Noor is also Chair of the NumFOCUS Code of Conduct Working Group and User Group Leader for IBM Women in AI, where she fosters inclusive, resilient communities across 300+ open-source projects.\r\n\r\nA frequent keynote speaker, Noor has presented at PyData Global, SciPy, ODSC, TEDx, IEEE, and 13+ global venues, delivering talks that connect technical depth with real-world adoption of AI and cloud. She has authored and led initiatives such as the IEEE Hour of Power AI training program, empowering engineers and professionals with practical AI skills.\r\n\r\nHer contributions to technology and leadership have been recognized with awards, including the Australia Alumni Excellence Award and Asia Pacific HRM Congress Award, with media features in the BBC, Martha\u2019s Vineyard Times, and Hindustan Times.\r\n\r\nGitHub: aftabn81\r\n| Website: www.nooraftab.com", "public_name": "Noor Aftab", "guid": "4339b178-a36e-542a-a8ee-7478abfa8d2e", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/HFCNVU/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/FLD9SR/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/FLD9SR/", "attachments": []}, {"guid": "c5ef2240-5dba-5b37-85ff-8e32ba05fa66", "code": "SBM8ZY", "id": 77389, "logo": null, "date": "2025-12-11T16:00:00+00:00", "start": "16:00", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-77389-garbage-in-lawsuit-out-building-compliant-and-reproducible-ml-pipelines", "url": 
"https://cfp.pydata.org/pydataglobal2025/talk/SBM8ZY/", "title": "Garbage In, Lawsuit Out: Building Compliant and Reproducible ML Pipelines", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "Your model might pass all the benchmarks\u2014but can it survive a subpoena? In the race to ship AI, most teams are building workflows that look great in dashboards but fall apart under legal, regulatory, or ethical pressure. Because the real liability doesn\u2019t live in your model weights\u2014it\u2019s buried in your data.", "description": "This session is a reality check for anyone shipping machine learning in production. We\u2019ll walk through the dark corners of modern ML pipelines: mutable datasets with no history, mystery data sources with missing labels, and a forgotten column of PII that\u2019s just been shipped to production. Then we\u2019ll show how to fix it\u2014without turning your data team into compliance officers. \r\n\r\nYou\u2019ll learn how to embed reproducibility, traceability, and policy enforcement into your pipeline without slowing it to a crawl: track every dataset change, version every experiment, validate against policy gates, and generate audit trails that actually mean something. Whether you\u2019re dealing with GDPR, HIPAA, or just not wanting to get roasted by internal audit, this talk gives you the blueprint for ML you can defend in court\u2014and still ship on time.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ZLVDPU", "name": "Itai Gilo", "avatar": "https://cfp.pydata.org/media/avatars/ZLVDPU_kDVN31l.webp", "biography": "Itai is a seasoned software engineer, passionate about clean code and design, and about simplifying what is complex. 
Doing what\u2019s needed, whether it\u2019s backend, full-stack, or mobile development, and enjoys creating well-crafted products.", "public_name": "Itai Gilo", "guid": "2f822547-d8ea-572c-8591-65c03137289e", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/ZLVDPU/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/SBM8ZY/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/SBM8ZY/", "attachments": []}, {"guid": "c8e727f1-b32e-5bbc-b9b3-cc09fa047929", "code": "SSVDUG", "id": 79433, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/SSVDUG/role-user-group_GfdEY4N.png", "date": "2025-12-11T17:00:00+00:00", "start": "17:00", "duration": "00:30", "room": "General Track", "slug": "pydataglobal2025-79433-connected-identities-rethinking-identity-and-access-management-with-neo4j-and-python", "url": "https://cfp.pydata.org/pydataglobal2025/talk/SSVDUG/", "title": "Connected Identities: Rethinking Identity and Access Management with Neo4j and Python", "subtitle": "", "track": "General Track", "type": "Talk", "language": "en", "abstract": "Access control is ultimately about relationships\u2014between people, systems, and resources. In this talk, we\u2019ll look at how modeling connected identities with a graph database unlocks a more efficient and transparent way to manage Identity and Access Management (IAM).\r\n\r\nUsing Neo4j and Python, we\u2019ll walk through a practical approach to building an IAM system that prioritizes clarity, performance, and portability. You\u2019ll learn how to model users, roles, and permissions as a connected graph, write access logic in Cypher, and deploy a lightweight system that scales without adding complexity.\r\n\r\nIn this fast-paced talk, you\u2019ll learn how to :\r\n\r\n- Map users, roles, and permissions like a detective\r\n\r\n- Write smart queries to control access\r\n\r\n- Build a lightweight, graph-powered IAM engine\r\n\r\nNo graph skills? No problem. 
Just bring Python and curiosity.", "description": "Access control: it sounds boring\u2014until it breaks. In this talk, we\u2019ll look at how to build a smarter Identity and Access Management (IAM) system using Neo4j and Python, and why graphs are a game-changer for modeling who can do what.\r\n\r\nYou\u2019ll get a crash course in graph-based thinking for IAM, see how to represent users, roles, and permissions as connected data, and learn how a few Cypher queries can uncover misconfigurations, rogue access, and hidden connections\u2014all in real time.\r\n\r\nAs systems scale and architectures grow more distributed, Identity and Access Management (IAM) often becomes a heavy, costly layer\u2014difficult to maintain, expensive to scale, and slow to adapt. But it doesn\u2019t have to be this way.\r\n\r\nThis talk introduces an approach to IAM that is lightweight, portable, and cost-efficient, using Neo4j and Python. By leveraging the natural connectedness of identity data\u2014users, roles, permissions, and resources\u2014we can model access in a way that\u2019s easy to manage, fast to query, and flexible to deploy.\r\n\r\nAttendees will learn how to build a graph-based IAM system that avoids complex cloud dependencies, offers real-time access insights, and supports role- and attribute-based access control without requiring massive infrastructure. Whether you're managing internal tools, building developer platforms, or scaling services, this approach provides strong access control without unnecessary overhead.", "recording_license": "", "do_not_record": false, "persons": [{"code": "QYUK8E", "name": "Irina Loghin", "avatar": "https://cfp.pydata.org/media/avatars/QYUK8E_tNzwFWj.webp", "biography": "Irina Loghin is a Technical Curriculum Developer at Neo4j and an Identity and Access Management (IAM) expert. 
With a background in security architecture and developer education, she specializes in making complex IAM concepts accessible through graph-based thinking and practical solutions.\r\n\r\nAt Neo4j, Irina designs technical learning programs that help engineers and architects rethink identity through connected data models.\r\n\r\nShe is passionate about building clear mental models for modern IAM, and advocates for approaches that prioritize portability, visibility, and developer autonomy.", "public_name": "Irina Loghin", "guid": "6d176025-d6f8-56b6-ad00-b3d4e0da6f4a", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/QYUK8E/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/SSVDUG/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/SSVDUG/", "attachments": []}], "Machine Learning & AI": [{"guid": "b99f3428-1afb-529e-922d-c3befd589f04", "code": "AJD8TU", "id": 78720, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/AJD8TU/PyData_Presentation_WfU6jwu.png", "date": "2025-12-11T11:30:00+00:00", "start": "11:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78720-revolutionizing-safety-log-analysis-in-oil-and-gas-a-multi-stage-llm-approach-for-enhanced-hazard-identification", "url": "https://cfp.pydata.org/pydataglobal2025/talk/AJD8TU/", "title": "Revolutionizing Safety Log Analysis in Oil and Gas: A Multi-Stage LLM Approach for Enhanced Hazard Identification", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "In this presentation, we demonstrate how Large Language Models (LLMs) can revolutionize safety log analysis in the oil and gas industry. Our research with a major operator involved processing 15,000 safety observations through a novel multi-stage pipeline. First, we developed a domain-specific categorical framework aligned with industry standards. 
We then implemented an unsupervised learning approach using sentence transformers to calculate semantic similarity between observations and predefined categories. This enabled multi-dimensional classification with weighted confidence percentages. Finally, we deployed a fine-tuned LLM to assign priority scores and enhance categorization accuracy, all while maintaining data privacy through on-premises processing. The resulting system streamlines real-time safety log processing, enabling more efficient identification of potential hazards and trends. Our implementation demonstrates significant improvements in classification accuracy and processing efficiency compared to traditional methods, providing actionable insights for proactive safety management.", "description": "This presentation explores a new application of Large Language Models (LLMs) in the oil and gas industry, specifically for safety log analysis. While oil and gas operators have traditionally been cautious in adopting LLM technologies, this project demonstrates a compelling use case that delivers tangible value through enhanced hazard identification and trend analysis. Attendees will learn how our multi-stage LLM pipeline processes safety observations to generate actionable insights while maintaining data privacy through on-premises processing. The presentation will showcase how this approach significantly improves classification accuracy and processing efficiency compared to traditional methods, providing a practical framework for organizations looking to leverage AI for safety management.", "recording_license": "", "do_not_record": false, "persons": [{"code": "TNTN3S", "name": "Andrew Yule", "avatar": "https://cfp.pydata.org/media/avatars/TNTN3S_iF5Tjdd.webp", "biography": "Andrew Yule is a co-founder and managing partner of Pontem Analytics, a global consulting company in the energy industry specializing in combining domain expertise with data-driven solutions. 
Andrew has 14 years of experience in the energy industry, where he has contributed to a diverse range of projects spanning both offshore and onshore. He has been a member of SPE since he began his career in 2011 and is currently a contributor for SPE\u2019s The Way Ahead magazine as well as a chairman on the Fort Worth SPE board. He is also a member of the Young Entrepreneurial Council. His technical background includes a bachelor\u2019s degree in Chemical Engineering from the Colorado School of Mines and a master\u2019s degree in data science from Southern Methodist University.", "public_name": "Andrew Yule", "guid": "2ce4ff90-11e2-5a1a-aeb0-93bafc4f54d2", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/TNTN3S/"}, {"code": "NSFATL", "name": "Iain Docherty", "avatar": "https://cfp.pydata.org/media/avatars/NSFATL_oK6qJW0.webp", "biography": "Iain Docherty is a Chemical Engineer with over 10 years of experience across nuclear, energy, mining, and renewables sectors. He is currently a Lead Engineer at Pontem Analytics, specializing in combining first-principles modelling with data-driven approaches to optimise processes. 
Proven experience in developing and deploying control and optimization solutions leveraging deep reinforcement learning and machine learning techniques.", "public_name": "Iain Docherty", "guid": "3c7ae59a-e7bc-5a35-b53c-0bcb332d84aa", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/NSFATL/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/AJD8TU/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/AJD8TU/", "attachments": []}, {"guid": "d24ab89a-d3c2-5fb5-92fc-80d288c27fab", "code": "ATM79G", "id": 77504, "logo": null, "date": "2025-12-11T12:00:00+00:00", "start": "12:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-77504-how-big-are-slms", "url": "https://cfp.pydata.org/pydataglobal2025/talk/ATM79G/", "title": "How Big are SLMs", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Small Language Models (SLMs) are designed to deliver high performance with significantly fewer parameters compared to Large Language Models (LLMs). Typically, SLMs range from 100 million to 30 billion parameters, enabling them to operate efficiently on devices with limited computational resources, such as smartphones and embedded systems", "description": "The development of SLMs addresses the growing demand for AI solutions that are cost-effective, energy-efficient, and capable of running locally to ensure data privacy and reduce latency. Recent advancements have demonstrated that SLMs can rival or even surpass larger models in specific tasks, thanks to optimized architectures and training methodologies.\r\nA notable example is Google's Gemma 3, a multimodal SLM family with models ranging from 1 to 27 billion parameters. Gemma 3 introduces vision understanding capabilities, supports longer context windows of at least 128K tokens, and employs architectural changes to reduce memory usage. 
The 27B parameter version of Gemma 3 has achieved competitive performance, ranking among the top 10 models in the LMSys Chatbot Arena with an Elo score of 1339.\r\nThe shift towards SLMs signifies a paradigm change in AI development, focusing on creating models that are not only powerful but also accessible and adaptable to a wide range of applications. As the field evolves, SLMs are poised to play a crucial role in democratizing AI technology.", "recording_license": "", "do_not_record": false, "persons": [{"code": "J3X3FG", "name": "Jayita Bhattacharyya", "avatar": "https://cfp.pydata.org/media/avatars/J3X3FG_BDquhVh.webp", "biography": "AI ML Nerd with a blend of technical speaking & hackathon wizardry! Applying tech to solve real-world problems. The work focus these days is on generative AI. Helping software teams incorporate AI into transforming software engineering.", "public_name": "Jayita Bhattacharyya", "guid": "5b70d539-f571-5682-8792-d301fdfe5beb", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/J3X3FG/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/ATM79G/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/ATM79G/", "attachments": []}, {"guid": "c3291f6f-233a-50a5-8a7a-9f5a6f508618", "code": "ECCYVF", "id": 78505, "logo": null, "date": "2025-12-11T13:00:00+00:00", "start": "13:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78505-automating-ml-with-pycaret-train-compare-multiple-models-to-find-the-best-performer", "url": "https://cfp.pydata.org/pydataglobal2025/talk/ECCYVF/", "title": "Automating ML with PyCaret: Train & Compare Multiple Models to Find the Best Performer", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "This live demonstration shows how PyCaret, an open-source low-code machine learning library, can dramatically simplify model training and comparison workflows. 
PyCaret is democratizing machine learning by empowering anyone to train multiple algorithms and compare their performance with minimal code. Attendees will witness live demonstrations of training various ML algorithms and using automated comparison techniques to select the best performer based on key metrics. Perfect for data scientists, developers, and ML enthusiasts looking to spend less time coding and more time on model analysis and selection.", "description": "Machine learning workflows often involve repetitive tasks, complex code, and time-consuming model comparisons. PyCaret changes this paradigm by democratizing machine learning - empowering anyone to train multiple algorithms and systematically compare their performance with low-code solutions. With PyCaret's philosophy of \"spend less time coding and more time on analysis,\" this library transforms the model selection process by automating training and comparison across multiple algorithms.\r\nIn this 30-minute session, you'll discover:\r\n\r\nML and PyCaret Fundamentals (13 mins)\r\n\r\n1. What is Machine Learning, Machine Learning Algorithms and workflows\r\n2. What is PyCaret\r\n\r\n Live Demo: Multi-Algorithm Training & Comparison (10 mins)\r\n\r\n1. Hands-on demonstration using the Diabetes Dataset\r\n2. Training multiple algorithms simultaneously with minimal code\r\n3. Automated model comparison using various performance metrics\r\n4. Real-time exploration of model performance visualizations\r\n5. Selecting the best performer based on key evaluation metrics\r\n\r\n\r\n Wrap-up & Resources (2 mins)\r\n\r\n1. Key takeaways and next steps\r\n2. Access to GitHub repository with slides and demo notebooks\r\n\r\nQ&A (5 min)\r\n\r\nWho Should Attend:\r\n\r\n1. Data scientists looking to accelerate their workflow\r\n2. Python developers interested in machine learning\r\n3. ML practitioners seeking efficient model prototyping tools\r\n4. 
Anyone curious about low-code ML solutions\r\n\r\nPrerequisites:\r\n\r\n1. Basic understanding of Python\r\n2. Familiarity with machine learning concepts (helpful but not required)\r\n3. No prior PyCaret experience needed\r\n\r\nWhat You'll Take Away:\r\n\r\n1. Practical knowledge of automated model training and comparison\r\n2. Experience with systematic algorithm evaluation using PyCaret\r\n3. Understanding of performance metrics for model selection\r\n4. Ready-to-use code examples for multi-algorithm comparison\r\n5. Confidence to choose the best ML algorithm for your specific projects\r\n\r\nJoin us for this fast-paced, demo-heavy session that will transform how you approach machine learning projects!", "recording_license": "", "do_not_record": false, "persons": [{"code": "8WBQVJ", "name": "Manjunath Janardhan", "avatar": "https://cfp.pydata.org/media/avatars/8WBQVJ_67LDfgQ.webp", "biography": "I am a Principal AI Engineer with over two decades of experience transforming complex business challenges through innovative AI solutions. My career is defined by delivering measurable impact, including a patented Intelligent Service Platform that achieved an 80% reduction in operational costs.\r\nCurrently at MSG Global Solutions, I lead AI development initiatives for SAP Enterprise applications, with a primary focus on SAP Profitability and Performance Management (PaPM). My work involves architecting and implementing enterprise-scale Generative AI solutions for the PaPM Universal Model, where I integrate vector databases with SAP HANA to significantly enhance information retrieval capabilities.\r\n\r\nMy previous role at GE Healthcare demonstrated my ability to scale AI solutions globally, where I built on-premises Generative AI systems that boosted developer productivity by 40% across international teams. 
I specialize in combining open-source Large Language Models with Hybrid-RAG and Agentic techniques, leveraging cloud-native architectures across AWS, Azure, and GCP platforms. My portfolio includes high-impact tools such as MICT GPT, CODE GPT, and Service GPT, with Aspire CODE GPT notably reducing development time for the Aspire CT Product by 30%.\r\n\r\nMy technical foundation encompasses the complete software development lifecycle, from modernizing monolithic systems to microservices using Java and C++, to containerizing applications with Docker and Kubernetes. I maintain active contributions to open-source NLP projects, reflecting my commitment to advancing the broader AI community.\r\n\r\nProfessional development remains central to my practice. I regularly engage with the AI community through conferences, workshops, webinars, and hackathons, recently developing a working prototype for a Socratic DSA Tutor. As an industry speaker, Medium blogger, and content creator, I share practical insights on AI implementation strategies and emerging technologies, focusing on mentoring the next generation of AI engineers while driving innovation in enterprise AI applications.", "public_name": "Manjunath Janardhan", "guid": "8bc98598-1623-5972-88fd-c76eb7ccf935", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/8WBQVJ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/ECCYVF/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/ECCYVF/", "attachments": []}, {"guid": "40ce2da2-7747-52fb-a7cc-1d55d138e4ae", "code": "7MEX7V", "id": 78603, "logo": null, "date": "2025-12-11T13:30:00+00:00", "start": "13:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78603-streaming-ai-workflows-in-python-kafka-queues-and-flink-powered-llm-inference", "url": "https://cfp.pydata.org/pydataglobal2025/talk/7MEX7V/", "title": "Streaming AI Workflows in Python: Kafka Queues and Flink-Powered LLM Inference", 
"subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Python users working on real-time analytics\u2014from payment processing and fraud detection to AI-driven support\u2014rely on message queues to keep data moving reliably and efficiently. Traditional message queues, however, can struggle with large-scale, concurrent workloads, especially when you need durability and replayability.\r\n\r\nIn this session, we\u2019ll show how Kafka 4.0 introduces robust queue semantics to distributed streaming, empowering Python applications to handle fair, concurrent, and isolated message processing at scale\u2014using familiar Kafka Python clients and frameworks.\r\n\r\nBut the power lies in what you can build next. We\u2019ll demonstrate how Apache Flink can connect Kafka event streams to real-time Large Language Model (LLM) inference for tasks like sentiment analysis and summarization, all orchestrated via Python APIs and remote model endpoints for powerful, flexible AI inference.\r\n\r\nTo complete the picture, we\u2019ll cover how enriched results can be stored in popular data lake solutions\u2014such as Apache Iceberg\u2014enabling long-term analytics, time travel, and integration with downstream data science workflows. 
Support for Iceberg and other lakehouse formats is optional, giving you flexibility to choose the right data backend for your needs.", "description": "This talk includes:\r\n\r\nLive Python-oriented demo and architecture walkthrough.\r\n\r\nBuilding an end-to-end pipeline: Kafka queue \u2192 Flink+LLM inference \u2192 (optional) Data lake storage (e.g., Iceberg).\r\n\r\nPython code samples, best practices, and design patterns for powering real-time, intelligent analytics on modern cloud-native stacks.\r\n\r\nWhether you\u2019re developing in Jupyter Notebooks, Pandas, or PySpark, you\u2019ll discover practical ways to combine Kafka, Flink, and LLMs in your Python data workflows\u2014with or without a lakehouse backend.", "recording_license": "", "do_not_record": false, "persons": [{"code": "JWQCRC", "name": "Shekhar Prasad Rajak", "avatar": "https://cfp.pydata.org/media/avatars/JWQCRC_dnzgfQc.webp", "biography": "Passionate Open Source Advocate and Software Engineer at Apple.\r\nShekhar is a seasoned open-source developer and advocate, with contributions to SymPy, NumPy, SciPy, Bundler, and as the author of daru and daru-view in the SciRuby ecosystem. A two-time GSoC alumnus (2016,17) and former SciRuby org admin, he has mentored across multiple open-source communities. He has spoken at leading conferences, including RubyConf, PyCon, ApacheCon, and Community Over Code. Currently, he is a Software Development Engineer at Apple, driving innovation in software engineering.", "public_name": "Shekhar Prasad Rajak", "guid": "777cae0e-ccea-572a-9281-284d7d98d67c", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/JWQCRC/"}, {"code": "V7BZSH", "name": "bhrathjatoth", "avatar": "https://cfp.pydata.org/media/avatars/V7BZSH_xdrHVzx.webp", "biography": "Senior AI Engineer with over eight years of experience architecting scalable machine learning, generative AI, and LLM solutions. 
Holding a B.Tech from IIT Guwahati, he specializes in RAG, LangChain, PyTorch, and AWS, delivering innovations like a fact-checking system for Cyara. He has led risk quantification and hallmarking software projects, boosting exports by 8\u20139% CAGR. Recognized at CGI\u2019s Global Meet 2018, Bharath drives transformative AI solutions with Docker, Kubernetes, and cloud pipelines, blending technical expertise with impactful leadership.", "public_name": "bhrathjatoth", "guid": "da0bb523-7808-5cba-8df8-6f97a79f8186", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/V7BZSH/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/7MEX7V/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/7MEX7V/", "attachments": []}, {"guid": "0d5827ae-7a0e-5a7e-b38a-1c241a67d181", "code": "SPFEYP", "id": 78498, "logo": null, "date": "2025-12-11T14:00:00+00:00", "start": "14:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78498-from-handwritten-notes-to-smart-knowledge-build-local-ai-agents-with-python", "url": "https://cfp.pydata.org/pydataglobal2025/talk/SPFEYP/", "title": "From Handwritten Notes to Smart Knowledge: Build Local AI Agents with Python", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Your notebooks are full of insights\u2014but they\u2019re scattered and hard to search.\r\nIn this live-coding session I\u2019ll show how to turn handwritten notes into a searchable, connected knowledge base using local AI and minimal Python.\r\n\r\nWe start with AnythingLLM\u2019s UI for quick wins, then move to Python agents that:\r\n\u2022 classify note types,\r\n\u2022 extract key ideas,\r\n\u2022 build a personal knowledge graph.\r\n\r\nThe entire stack runs on your laptop with MLC-AI\u2014no cloud, no data leaks.\r\nYou\u2019ll leave with a reusable agent blueprint you can drop into any data-processing workflow tomorrow.", "description": "What you\u2019ll 
learn\r\n\u2022 When to stay in a UI vs. when Python is essential\r\n\u2022 How to orchestrate agents with CrewAI and plug in custom logic\r\n\u2022 Clean patterns for local LLM inference with MLC-AI\r\n\u2022 A complete, copy-paste-ready pipeline for knowledge extraction & linking\r\n\r\nLive demos\r\n\r\nAnythingLLM quick-start (2 min)\r\nPython agent orchestration classifying & linking 10+ handwritten notes (15 min)\r\nQuerying the resulting knowledge graph for recurring themes (3 min)\r\nTake-home repo\r\nGitHub repo + requirements.txt + Docker compose file so attendees can rerun everything on their own notes.\r\n\r\nPrerequisites\r\nBasic Python (functions, classes, pip install). No prior AI/ML knowledge required.", "recording_license": "", "do_not_record": false, "persons": [{"code": "8WXHJU", "name": "piotr stepinski", "avatar": "https://cfp.pydata.org/media/avatars/8WXHJU_7JwnfJT.webp", "biography": "Data Science Leader with extensive experience in AI and MLOps, currently serving as the CTO at Infinitii AI. He has a strong background in team leadership, product innovation, and building scalable data-driven solutions. Piotr is passionate about using AI to solve real-world problems, particularly in time-series analysis. 
He is an advocate for Agile methodologies and MLOps practices, and has spoken at conferences about these topics.", "public_name": "piotr stepinski", "guid": "dcf87914-305b-5146-9f40-26972892d367", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/8WXHJU/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/SPFEYP/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/SPFEYP/", "attachments": []}, {"guid": "409b484b-ebf3-58f2-8b70-def42abbefc0", "code": "RQSLXN", "id": 77515, "logo": null, "date": "2025-12-11T14:30:00+00:00", "start": "14:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-77515-detecting-regime-shifts-in-time-series-with-python-entropy-based-change-point-detection", "url": "https://cfp.pydata.org/pydataglobal2025/talk/RQSLXN/", "title": "Detecting Regime Shifts in Time Series with Python: Entropy-Based Change-Point Detection", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Financial and other real-world time series often experience abrupt regime changes that can break assumptions and invalidate models. This talk shows how to use k-nearest neighbor entropy estimators combined with clustering algorithms, implemented entirely in Python, to detect these change-points early. We\u2019ll explore practical examples with financial market data, discuss strengths and limitations, and provide reusable open-source code. 
Attendees will leave with tools to make their time series models more robust to sudden structural changes.", "description": "Time series data in finance, IoT, or sensor monitoring are rarely stationary \u2014 regime shifts happen suddenly, and failing to detect them early can lead to inaccurate predictions or large financial losses.\r\n\r\nThis talk presents a practical, Python-based approach to change-point detection in multivariate time series using k-nearest neighbor entropy estimators combined with clustering techniques. This method uses open-source libraries like NumPy, scikit-learn, and pandas, and can be adapted to various domains.\r\n\r\nTakeaways:\r\n\r\n- How to implement entropy-based change-point detection with open-source Python tools.\r\n\r\n- How to identify and handle abrupt shifts in time series to make models more robust.\r\n\r\n- How to apply these techniques beyond finance to any time series with regime shifts.", "recording_license": "", "do_not_record": false, "persons": [{"code": "FBCVY8", "name": "Sergei Nasibian", "avatar": "https://cfp.pydata.org/media/avatars/FBCVY8_IMgGo9G.webp", "biography": "Sergei Nasibian is a Quantitative Strategist at Rothesay in London, where he designs and implements systematic trading and risk management models. He previously worked as a Data Scientist at McKinsey & Company and as a Senior Analyst at Yandex Eats, developing data-driven strategies across diverse domains. Sergei holds a degree with honors in Mathematics from Lomonosov Moscow State University, specializing in probability theory and stochastic processes. His research experience includes entropy-based change-point detection methods developed during a collaboration with Ulm University. 
Sergei is passionate about translating advanced mathematical concepts into practical, production-ready tools using open-source Python libraries, and he enjoys exploring intersections between machine learning, statistical modeling, and financial markets.", "public_name": "Sergei Nasibian", "guid": "80409f35-f171-53af-91ed-fc4cf6a61c24", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/FBCVY8/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/RQSLXN/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/RQSLXN/", "attachments": []}, {"guid": "d2317a0e-8f82-543f-9cd4-df1cccfb5550", "code": "DYXWAV", "id": 79526, "logo": null, "date": "2025-12-11T16:00:00+00:00", "start": "16:00", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-79526-future-proof-your-ai-product", "url": "https://cfp.pydata.org/pydataglobal2025/talk/DYXWAV/", "title": "Future proof your AI product", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "In this talk I will cover frequent AI system problems caused by using prompts and opaque frameworks instead of a descriptive programmatic approach, using DSPy.", "description": "Most LLM frameworks are too opaque and obscure what they are doing. New state of the art models are released every week and different models respond differently to the same prompts. These frameworks' hardcoded prompts within the library make it difficult to debug, update and improve the system. Also, walls of text are a terrible way to program, and hardly maintainable. DSPy is a better way, using abstractions to code your intent into the LLM without defining the prompt, making it future proof. 
Changing one line, you can change models, tasks or inference strategy.", "recording_license": "", "do_not_record": false, "persons": [{"code": "Q3D9GR", "name": "Breno Brito", "avatar": "https://cfp.pydata.org/media/avatars/Q3D9GR_EdCGkCp.webp", "biography": "ML engineer, Data Scientist and author with over a decade of total experience, especially in the Finance and Bitcoin industries. I translated several books from English to Portuguese, won prizes in several hackathons with LLM solutions and have been interviewed in dozens of podcasts and newspapers.", "public_name": "Breno Brito", "guid": "ec834da8-05e2-5bea-be7e-4e99f3cd70ab", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/Q3D9GR/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/DYXWAV/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/DYXWAV/", "attachments": []}, {"guid": "23184d27-304e-5f1f-8464-416c542de94e", "code": "EJJSKK", "id": 78765, "logo": null, "date": "2025-12-11T16:30:00+00:00", "start": "16:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-78765-hpc-implementation-of-a-hybrid-recommender-system-in-julia", "url": "https://cfp.pydata.org/pydataglobal2025/talk/EJJSKK/", "title": "HPC Implementation of a Hybrid Recommender System in Julia", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "This talk discusses a hybrid recommender system implemented in Julia for preselecting job applicants. The recommender system is built using a neural network adopting a hybrid architecture that combines convolutional layers of a graph neural network and a transformer (both encoder and decoder). We discuss the preprocessing of applicant metadata and job adverts to generate a heterogeneous graph. 
Next, we present the recommender as a model and its training using an HPC.", "description": "In this talk, we present the implementation of a hybrid recommender system that helps preselect candidates for a job application. We discuss the preprocessing of the data following NLP techniques and building on various libraries, including TextAnalysis, Embeddings and MLJ. The input information (applicant metadata and job adverts) is aggregated into a heterogeneous graph, later converted into a GNN using GraphNeuralNetworks. The underlying model supporting the recommendations combines several graph convolutional layers and a transformer (encoder and decoder). To make the model's training more efficient, we rely on the Distributed and ClusterManagers libraries. Note that our preprocessing and training steps are implemented using a supercomputer. We present the implementation and the job submission details.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DQJUKB", "name": "Jos\u00e9 Quenum", "avatar": "https://cfp.pydata.org/media/avatars/DQJUKB_5y9Zan7.webp", "biography": "Jos\u00e9 Quenum is a Researcher at the Namibia University of Science and Technology (NUST). 
His interests include Distributed Systems, Artificial Intelligence and Big Data.", "public_name": "Jos\u00e9 Quenum", "guid": "dc7dde8b-4745-5e5e-882a-2bf4559417e6", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/DQJUKB/"}, {"code": "FDR77P", "name": "marthin thomas", "avatar": null, "biography": null, "public_name": "marthin thomas", "guid": "046b5eb4-c3c5-5bef-8d7a-1520f4aee6b0", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/FDR77P/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/EJJSKK/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/EJJSKK/", "attachments": []}, {"guid": "65f6b81c-ea6a-5ed3-95ca-ba0f8b484082", "code": "7PTYQX", "id": 79471, "logo": null, "date": "2025-12-11T18:30:00+00:00", "start": "18:30", "duration": "00:30", "room": "Machine Learning & AI", "slug": "pydataglobal2025-79471-tinytroupe-enhancing-marketing-insights-through-llm-powered-multiagent-persona-simulation", "url": "https://cfp.pydata.org/pydataglobal2025/talk/7PTYQX/", "title": "TinyTroupe: Enhancing Marketing Insights through LLM-Powered Multiagent Persona Simulation", "subtitle": "", "track": "Machine Learning & AI", "type": "Talk", "language": "en", "abstract": "Understanding customer behavior is essential in marketing. Traditionally, marketers rely on methods such as surveys, customer interviews, and focus groups to gather insights. However, these approaches can be expensive, time-consuming, and limited in scale and diversity.\r\nRecently, multi-agent simulation powered by Large Language Models (LLMs) is emerging as an innovative technique.  
TinyTroupe, for example, enables the creation of different personas (e.g., budget\u2011minded Gen\u2011Z shoppers, premium\u2011seeking parents), allowing marketers to predict and optimize advertising effectiveness or replace time-consuming interviews rapidly.\r\nIn this talk, I will introduce the key concepts of LLM-powered multi-agent simulations, demonstrate their practical application in marketing through TinyTroupe, and share actionable insights and recommendations.", "description": "**Agenda:**\r\n\r\n- Introduction\r\n- Business Context: customer understanding & traditional research\r\n- The Challenge: \u201cCan\u2019t we just use ChatGPT?\u201d\r\n- TinyTroupe: LLM-powered multi-agent persona simulation\r\n- Code Walkthrough: end-to-end concept-test demo (running-shoe example)\r\n- Summary & practical tips\r\n\r\n**Code Walkthrough Part**\r\nhttps://github.com/takechanman1228/Effective-Persona-Simulation\r\n\r\n**Key Takeaways:**\r\n- Understand the core concepts and advantages of LLM-powered multi-agent persona simulation.\r\n- Learn how to leverage TinyTroupe for efficient and insightful marketing analytics.\r\n\r\n**Target Audience:**\r\n- Data analysts and data scientists interested in customer analytics and marketing.\r\n- Marketers, business analysts, and executives seeking innovative approaches to understanding customer behavior and optimizing marketing strategies.\r\n- IT specialists and developers interested in applying LLM and multi-agent simulation technologies to real-world business scenarios.", "recording_license": "", "do_not_record": false, "persons": [{"code": "7DWP9K", "name": "Hajime Takeda", "avatar": "https://cfp.pydata.org/media/avatars/7DWP9K_wvO9HMT.webp", "biography": "Hajime is a data professional with 8+ years of expertise in marketing, retail, and eCommerce, working in New York.", "public_name": "Hajime Takeda", "guid": "25dd241d-1f36-5efd-bc62-097c4d96e86a", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/7DWP9K/"}], 
"links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/7PTYQX/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/7PTYQX/", "attachments": []}], "Analytics, Visualization & Decision Science": [{"guid": "0cd5ca7e-a3b4-5004-9899-5a78ac859670", "code": "WRSZRV", "id": 77401, "logo": null, "date": "2025-12-11T13:30:00+00:00", "start": "13:30", "duration": "01:30", "room": "Analytics, Visualization & Decision Science", "slug": "pydataglobal2025-77401-computer-vision-data-version-control-and-reproducibility-at-scale", "url": "https://cfp.pydata.org/pydataglobal2025/talk/WRSZRV/", "title": "Computer Vision Data Version Control and Reproducibility at Scale", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Tutorial", "language": "en", "abstract": "Computer vision, the field focused on enabling machines to interpret and understand visual data, tackles challenges like image recognition, object detection, and scene understanding. PyData tools play a critical role in solving these issues by offering robust libraries like TensorFlow, PyTorch, Keras, and Langchain for building and training machine learning models, performing image processing, and managing large datasets. This hands-on session will enable attendees to learn how to optimize computer vision projects with end-to-end version control baked in.", "description": "Petabytes of unstructured data stand as the cornerstone upon which triumphant Machine Learning (ML) models are built.\u00a0One common method for researchers to extract subsets of data to their local environments is by simply using the age-old copy-paste, for model training. 
This method allows for iterative experimentation, but it also introduces challenges with the efficiency of data management when developing machine learning models, including reproducibility constraints, inefficient data transfer, alongside limited compute power.\r\n\r\nThis is where data version control technologies can help overcome these challenges for computer vision researchers.\u00a0In this workshop we'll cover:\r\n\r\n- How to use open source tooling to version control your data when working with data locally.\r\n- Best practices for working with data, preventing the need to copy data locally, while enabling the training of models at scale directly on the cloud.\u00a0This will be demoed with an OSS stack:\r\n- Langchain\r\n- Tensorflow\r\n- PyTorch\r\n- Keras\r\n\r\nYou will come away with practical methods to improve your data management when developing and iterating upon Machine Learning models, built for modern computer vision research.", "recording_license": "", "do_not_record": false, "persons": [{"code": "EHGG78", "name": "Joe Pringle", "avatar": "https://cfp.pydata.org/media/avatars/EHGG78_ueVerJJ.webp", "biography": "Joe Pringle is VP of Customer Success at lakeFS supporting open source data version control and infrastructure, by providing expertise on data strategy, data science, AI and machine learning. He helps accelerate innovation, and plan and execute data science and machine learning initiatives. He has 20+ years experience helping large enterprises use data to increase impact on important public policy issues including education, health, the environment, and economic development. 
He also has a passion for focusing technology initiatives on people - and working backwards from understanding end users to identify opportunities to help busy people work faster, smarter, and better.", "public_name": "Joe Pringle", "guid": "60e44fd9-290a-5b2b-81a4-c3c028fe42cc", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/EHGG78/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/WRSZRV/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/WRSZRV/", "attachments": []}, {"guid": "75989e95-668f-5e6a-9430-c5ab48db4055", "code": "M78NZT", "id": 78718, "logo": "https://cfp.pydata.org/media/pydataglobal2025/submissions/M78NZT/displacment_risk_tjiLd1Y.png", "date": "2025-12-11T16:30:00+00:00", "start": "16:30", "duration": "00:30", "room": "Analytics, Visualization & Decision Science", "slug": "pydataglobal2025-78718-animating-equity-python-dashboards-for-small-town-housing-and-displacement-risk", "url": "https://cfp.pydata.org/pydataglobal2025/talk/M78NZT/", "title": "Animating Equity: Python Dashboards for Small-Town Housing and Displacement Risk", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Talk", "language": "en", "abstract": "This talk demonstrates how open-source Python tools like censusdis, pandas, and folium can be combined to create an interactive, time-enabled dashboard for visualizing economic vulnerability, housing affordability, and displacement risk in small communities. Using Oxford, NC as a case study, the talk showcases a multi-year, multi-indicator mapping project designed to support equitable local planning.", "description": "How do you turn raw census tables into something a small town can actually use to guide housing policy? 
In this talk, I walk through the design and development of an animated spatial dashboard built entirely with Python, designed to help local residents and planners in Oxford, North Carolina understand where their most vulnerable neighbors live \u2014 and how that vulnerability is changing over time.\r\n\r\nOxford is a rural town facing new development pressure, including non-contiguous annexation and suburban for-sale housing growth. While these changes promise tax base expansion, they also risk pushing out low-income renters, especially in historically underserved neighborhoods. My dashboard uses ACS 5-Year estimates and USDA Food Access data to visualize key indicators like rent burden, SNAP share, senior population, and a normalized displacement risk index \u2014 all animated from 2017 to 2023 using Leaflet.TimeDimension inside folium.\r\n\r\nThe talk is both a case study in data storytelling for place-based equity and a practical demo of working with geospatial census data in Python \u2014 no proprietary software or expensive tools required.\r\n\r\nOutline (with time estimates)\r\n\r\n0\u20135 min \u2014 Context: Why Oxford, NC? 
The risks of unchecked suburban growth for small cities\r\n\r\n5\u201310 min \u2014 Data: ACS, USDA, and parcel-level value data via censusdis and publicly-available shapefiles\r\n\r\n10\u201320 min \u2014 Dashboard architecture: Python data pipeline, Folium with TimeSliderChoropleth, adding map interactivity, overlays, and popups\r\n\r\n20\u201325 min \u2014 Use case: Displacement risk and the intersection of rent burden, food access, and annexation\r\n\r\n25\u201330 min \u2014 Q&A, tips for adapting the method to other communities\r\n\r\nAudience\r\n\r\nThis talk is intended for:\r\n\r\nData analysts, GIS specialists, and Python developers interested in civic tech or applied geospatial analysis\r\n\r\nPlanners, advocates, and public servants exploring how open data and open tools can improve policy transparency\r\n\r\nAnyone working with small-area census data, especially at the block group or tract level\r\n\r\nAttendees should have a basic familiarity with Python and data visualization libraries (pandas, folium, etc.), but no prior experience with geospatial programming is required.\r\n\r\nTakeaways\r\n\r\nAttendees will learn:\r\n\r\nHow to download and preprocess ACS data at the block group level using Python\r\n\r\nHow to build time-animated choropleth maps using folium + Leaflet.TimeDimension\r\n\r\nHow normalized composite indicators like a displacement risk index can help surface hidden patterns in small towns\r\n\r\nHow interactive mapping can drive better community conversations around housing, equity, and development", "recording_license": "", "do_not_record": false, "persons": [{"code": "RKPJMG", "name": "Matthew Cox", "avatar": "https://cfp.pydata.org/media/avatars/RKPJMG_16qMFOc.webp", "biography": "I'm a hobbyist Python user and data analyst, with a passion for making meaningful visualizations that illustrate the story behind the data. 
I've been coding to solve my own problems and curiosities for almost five years now, and this is my first application to present a project at a conference.", "public_name": "Matthew Cox", "guid": "c2601809-61da-56ed-a010-f8d2e5538f5e", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/RKPJMG/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/M78NZT/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/M78NZT/", "attachments": []}, {"guid": "4fb25c2f-980a-5821-8817-70b63bc404e6", "code": "VY398A", "id": 79220, "logo": null, "date": "2025-12-11T17:00:00+00:00", "start": "17:00", "duration": "00:30", "room": "Analytics, Visualization & Decision Science", "slug": "pydataglobal2025-79220-beyond-just-prediction-causal-thinking-in-machine-learning", "url": "https://cfp.pydata.org/pydataglobal2025/talk/VY398A/", "title": "Beyond Just Prediction: Causal Thinking in Machine Learning", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Talk", "language": "en", "abstract": "Most ML models excel at prediction, answering questions like _\"Who will buy our product?\"_ or _\"Which customers are likely to churn?\"_. But when it comes to making actionable decisions, prediction alone can be misleading. Correlation does not imply causation, and business decisions require understanding causal relationships to drive the right outcomes.\r\n\r\nIn this talk, we will explore how causal machine learning, specifically uplift modeling, can bridge the gap between prediction and decision making. Using a real-world use case, we will showcase how uplift modeling helps identify who will respond positively to interventions while avoiding those who they might deter.", "description": "## Audience\r\nThis talk is for data scientists and ML engineers at any level. 
Basic familiarity with Python and machine learning concepts is helpful but not required.\r\n\r\n## Objective\r\nAttendees will learn when to use causal thinking vs predictive modeling and how to implement uplift models using Python. They will also understand how to apply these techniques across different domains, such as marketing, healthcare, and other relevant fields.\r\n\r\n## Details\r\nPredictive ML models are used everywhere for data-driven decision making across industries. However, accurate forecasts don't always translate to optimal actions.\r\n\r\nWe will begin by exploring the fundamental challenges of deriving actions from model predictions, especially when determining the right audience to target. After that, we will dive into some fundamental concepts of causal inference and how it differs from traditional ML. We will then introduce uplift modeling and cover some key concepts, e.g., treatment effects, counterfactuals, meta-learning approaches, etc. We will see how these elements work together to create causal ML models. \r\n\r\nFinally, we will put theory into practice by building a sample uplift model in Python. 
We'll walk through each step using real-world intervention data (publicly available), demonstrating how this approach can dramatically improve decision-making and ensure that the interventions target the right audience for the right reasons.\r\n\r\n## Outline\r\n- Introduction and motivation [1 min]\r\n- From correlation to causation [4 min]\r\n   - Correlation vs Causation\r\n   - When do we need a causal angle\r\n- Core causal concepts [4 min]\r\n   - Treatment effects\r\n   - Counterfactuals\r\n   - Intervention problem\r\n- Uplift modeling concepts [5 min]\r\n   - Four types of individual responses to a treatment\r\n   - Meta learning approach\r\n   - T-Learner and S-Learner comparison\r\n- Hands-on case study [10 min]\r\n   - Problem explanation and formulation\r\n   - Predictive model output\r\n   - Causal uplift model in Python\r\n   - Compare targeting strategies and intervention impact\r\n- Evaluation [4 min]\r\n   - Why accuracy or F1 scores don\u2019t work for uplift\r\n   - Uplift curves\r\n   - Qini coefficient\r\n   - Explainability\r\n- Practical Considerations [2 min]\r\n   - A/B testing treatment effects\r\n   - Cross-domain applications", "recording_license": "", "do_not_record": false, "persons": [{"code": "Z7XLKH", "name": "Avik Basu", "avatar": "https://cfp.pydata.org/media/avatars/Z7XLKH_Rey05Sy.webp", "biography": null, "public_name": "Avik Basu", "guid": "ca1347a5-7adf-5604-b533-d0a11b6764be", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/Z7XLKH/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/VY398A/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/VY398A/", "attachments": []}, {"guid": "b8b45d81-f402-518b-9509-06a339d004cd", "code": "ESFUQB", "id": 77428, "logo": null, "date": "2025-12-11T17:30:00+00:00", "start": "17:30", "duration": "00:30", "room": "Analytics, Visualization & Decision Science", "slug": 
"pydataglobal2025-77428-enhancing-marketplace-competitiveness-a-bayesian-approach-to-modelling-the-cold-start-problem", "url": "https://cfp.pydata.org/pydataglobal2025/talk/ESFUQB/", "title": "Enhancing Marketplace Competitiveness: A Bayesian Approach to modelling the cold start problem", "subtitle": "", "track": "Analytics, Visualization & Decision Science", "type": "Talk", "language": "en", "abstract": "This session shows how Bayesian statistical modeling helps determine when you have collected enough data about new products, so that they are ready for competition. We'll explore: \r\nhow this approach enables efficient decision-making with minimal data\r\nwhy we chose Bayesian over machine learning models\r\nhow we covered for the required assumptions \r\nhow this enables a risk-management approach while providing interpretable results that business stakeholders can understand and trust \r\n\r\n\r\nYou will learn how to identify a Bayesian problem at your company and how to navigate the modelling with real-world data!", "description": "In this session, we will explore the application of Bayesian methodology to address the cold start problem in a recommendation system: determining if there is enough data for a new product in a marketplace to be accurately ranked, or if the product should get further exposure to reach that stage. \r\n\r\nThe target audience of this talk is data analysts of all levels, data practitioners interested in modelling, and professionals working in recommendation systems. \r\n\r\nUnlike traditional machine learning models, Bayesian statistical modelling offers a robust framework for updating probabilities with new evidence, making it particularly suited for dynamic environments like online marketplaces. 
That way, one can update the learnings on the performance of a new product daily, allowing for efficient decision-making around \u201cshould I keep on exploring this new product or not?\u201d while minimising the traffic investment and enabling a risk-management-based approach. We will also cover how we control for the assumptions that Bayesian requires. \r\n\r\nKey takeaways:\r\n1. Understanding Bayesian Methods: Learn how Bayesian statistics can be applied to real-world business problems, offering a flexible and interpretable approach to decision-making.\r\n\r\n2. Benefits Over Machine Learning: Discover why statistical modelling can be more advantageous than machine learning in certain business contexts, particularly when managing risk, handling sparse data and providing interpretable results to the business.\r\n\r\n3. Practical Application: Learn about the challenges of applying Bayesian models in a real marketplace.\r\n\r\nOutline:\r\nIntroduction to the cold-start problem (2 min)\r\nHow we rank incoming activities at GetYourGuide and how modelling could make us more efficient (5 min)\r\nExplaining the model (15 min)\r\nIntro to a Bayesian binomial model (3 min)\r\nControlling for independence among trials (3 min)\r\nDefining the prior (3 min)\r\nDesigning a stop criteria (6 min)\r\nRisk-management: why Bayesian modelling over Machine Learning (5 min)\r\nQuestions (3 min)\r\n\r\nPrerequisites\r\n\r\nLearn what the cold start problem in a recommender system is (https://en.wikipedia.org/wiki/Cold_start_(recommender_systems)).\r\n\r\nGet familiar with Bayesian thinking (https://www.countbayesie.com/blog/2022/2/19/how-to-read-the-news-like-a-bayesian).\r\n\r\nIf you want to go fancy, read this paper: https://arxiv.org/pdf/2410.02126", "recording_license": "", "do_not_record": false, "persons": [{"code": "AGWPER", "name": "Agustin Figueroa Nazar", "avatar": "https://cfp.pydata.org/media/avatars/AGWPER_32NLodw.webp", "biography": "Agus is a Senior Data Analyst at 
GetYourGuide, where he specializes in using data to identify customer and marketplace needs that could be solved at scale with data products. His work encompasses identifying customer problems, designing experimentation frameworks to measure progress, developing analytical solutions, and translating business requirements into data science projects. Beyond his core responsibilities, Agus is passionate about storytelling, teaching, singing, and almost anything on stage.", "public_name": "Agustin Figueroa Nazar", "guid": "8d3d7321-be43-5c6a-a29f-5aac2d53cf72", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/AGWPER/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/ESFUQB/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/ESFUQB/", "attachments": []}], "Data Engineering & Infrastructure": [{"guid": "529a6d43-2b3f-542b-9c86-6de75b975058", "code": "SRCNAR", "id": 78648, "logo": null, "date": "2025-12-11T11:30:00+00:00", "start": "11:30", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-78648-building-a-lightweight-feature-store-for-electricity-grid-forecasts-with-polars", "url": "https://cfp.pydata.org/pydataglobal2025/talk/SRCNAR/", "title": "Building a Lightweight Feature Store for Electricity Grid Forecasts with Polars", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "Get a firsthand look at how we built a lightweight feature store to accelerate electricity grid forecasting. We\u2019ll cover our decision process, design choices, and implementation using Polars and Google Cloud Storage. Expect lessons learned, real-world bumps, and a clear view of the costs, trade-offs and benefits of our solution.", "description": "In this talk, we\u2019ll share how we built a lightweight, production-ready feature store to support electricity grid forecasting. 
You'll hear a firsthand account of our journey\u2014from identifying the need to accelerating model prototyping through feature standardization and flexibility.\r\nWe\u2019ll start with a high-level overview of our decision-making process: why we chose to build rather than buy, and the trade-offs we considered. Then, we\u2019ll dive into the architecture of our custom feature store, detailing how we leveraged Polars for fast processing and Google Cloud Storage as a scalable backend.\r\nExpect an honest look at the challenges we faced, the benefits we gained, and the costs we encountered along the way. Whether you're considering building your own feature store or just curious about scaling ML for time series problems, this session will offer practical insights and real-world lessons.", "recording_license": "", "do_not_record": false, "persons": [{"code": "HE33WX", "name": "Robin Troesch", "avatar": "https://cfp.pydata.org/media/avatars/HE33WX_3BnECh6.webp", "biography": "Data Engineer trying to reduce the impact of computing on the climate and helping the energy transition.\r\nWorking at Electricity Maps in Copenhagen (DK) since 2022 first in the data platform team responsible for acquiring grid data. Joined the grid forecast team in 2023.\r\nCurrently working on electricity grid forecasts, enabling people to consume electricity when it's the cleanest and predicting load peaks. 
Especially interested in how to run large-scale infrastructure with a minimal footprint.", "public_name": "Robin Troesch", "guid": "980a4954-4a7d-5332-952b-f1756e2e56ad", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/HE33WX/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/SRCNAR/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/SRCNAR/", "attachments": []}, {"guid": "87c78bd7-88cf-50d4-84cd-3902f1dcbd41", "code": "YN7DYP", "id": 78709, "logo": null, "date": "2025-12-11T12:30:00+00:00", "start": "12:30", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-78709-engineering-large-scale-geospatial-raster-processing-with-xarray-and-dask", "url": "https://cfp.pydata.org/pydataglobal2025/talk/YN7DYP/", "title": "Engineering Large-scale geospatial raster processing with xarray and dask", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "Geospatial analysis often involves harmonizing and processing raster datasets from diverse sources with varying resolutions, coordinate systems, and data formats. This talk demonstrates how you can build efficient, scalable pipelines for zonal statistics extraction using Python\u2019s scientific computing stack, xarray, and dask to handle rasters that would otherwise overwhelm traditional processing approaches. 
\r\nThrough a real-world case study of processing multi-source geospatial data for small-area estimation of poverty, we\u2019ll explore practical strategies for memory-efficient raster harmonization, parallel computing workflows, and automated statistical aggregation across administrative boundaries.", "description": "This talk addresses a common challenge faced by data scientists, data engineers, researchers, and geospatial analysts working with large-scale geospatial data: how to efficiently process and harmonize raster datasets that exceed memory limits, while maintaining both data integrity and computational performance. Attendees are expected to have a basic familiarity with Python and an understanding of fundamental geospatial concepts.\r\n\r\nI will begin by outlining prevalent issues in geospatial data processing, such as memory constraints when working with large rasters, the difficulty of harmonizing datasets with varying resolutions and projections, and the computational cost of performing zonal statistics across multiple layers. To address these challenges, I will demonstrate how libraries like xarray and rioxarray offer elegant abstractions for geospatial data manipulation, while Dask facilitates out-of-core computation and parallel processing. A technical walkthrough will showcase a flexible pipeline designed to handle key data processing scenarios: downsampling, upsampling, masking, managing missing values, and other steps. \r\n\r\nI will do a live code demonstration from a project involving zonal statistics for small area poverty estimation. This will include processing layers such as population density, distance to healthcare, and nightlights to produce harmonized zonal statistics at administrative level three of a select country. 
To wrap up, we\u2019ll briefly touch on optimization strategies, including chunking techniques and memory management.", "recording_license": "", "do_not_record": false, "persons": [{"code": "EBDUZK", "name": "CLINTON OYOGO DAVID", "avatar": "https://cfp.pydata.org/media/avatars/EBDUZK_MxOgEn4.webp", "biography": "Clinton Oyogo David is a Data Scientist at Oxford Policy Management, specializing in geospatial analytics, data engineering, dashboard development, and automation. He has led data-intensive projects across Africa and Asia, developing data pipelines, dashboards, and data analysis for various organisations. Clinton combines a background in statistics with a deep interest in scalable data solutions that inform policy and drive impact. His recent work focuses on harmonizing large raster datasets using tools like xarray and Dask to support small area estimation of poverty and sustainable development research.", "public_name": "CLINTON OYOGO DAVID", "guid": "997aabbf-a60b-5762-a641-2687ef970f39", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/EBDUZK/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/YN7DYP/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/YN7DYP/", "attachments": []}, {"guid": "df2994bb-e0b6-5f54-aae7-17bfd93d10f0", "code": "VS8HWU", "id": 78713, "logo": null, "date": "2025-12-11T13:30:00+00:00", "start": "13:30", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-78713-accelerate-deployment-of-your-python-data-science-apps-using-shinyproxy", "url": "https://cfp.pydata.org/pydataglobal2025/talk/VS8HWU/", "title": "Accelerate deployment of your Python data science apps using ShinyProxy", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "ShinyProxy is 100% open-source software to deploy data science apps in an enterprise context. 
This talk will - for the first time - introduce ShinyProxy to the Python community. We'll start with a realistic example to explore what it takes to deploy a data science app for production use. Throughout the talk, you'll see how ShinyProxy addresses many of the common challenges faced when deploying apps.\r\nThese include authentication, scaling, security (such as TLS), audit logging, version control, reproducibility, and more. The main goal of ShinyProxy is to ensure data scientists can focus on doing science instead of spending time on technical requirements, procedures and maintenance. This talk is tailored for both data scientists and anyone interested in setting up ShinyProxy. No deep technical knowledge is required to follow along. At the end of the talk, you'll know everything to get started with ShinyProxy and to deploy your first app.", "description": "ShinyProxy is already a well known tool to deploy apps built using R and Shiny. This talk will - for the first time - introduce ShinyProxy to the Python community. In the first part of the talk we present how Bob wrote a super useful Python app, but struggles to get it deployed at Bob's company. A first challenge is to get hold of a server with all dependencies and libraries installed. Next, Bob is informed that the app must be protected using TLS and integrated with the existing authentication system. After these first obstacles Bob learns that there are even more requirements and gets stuck. The second part of this talk demonstrates how Bob can solve all these problems using ShinyProxy. For example, using container technology (Docker), Bob has full control on installing dependencies and libraries, while at the same time improving the reproducibility of the setup. This talk is tailored for both data scientists and anyone interested in setting up ShinyProxy. No deep technical knowledge is required to follow along. 
At the end of the talk, you'll know everything to get started with ShinyProxy and to deploy your first app. ShinyProxy supports almost any web application, including Streamlit, Dash, Voila and Gradio. Therefore, we don't focus on a specific framework. Everything covered in this talk is applicable to your favourite framework.", "recording_license": "", "do_not_record": false, "persons": [{"code": "WYXUBR", "name": "Tobia De Koninck", "avatar": "https://cfp.pydata.org/media/avatars/WYXUBR_LQkRKwa.webp", "biography": "I work as a Software and Infrastructure engineer on open-source tools for data science.", "public_name": "Tobia De Koninck", "guid": "c053d4a2-8ac8-5681-bbd1-fa208c0c968d", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/WYXUBR/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/VS8HWU/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/VS8HWU/", "attachments": []}, {"guid": "45af4168-c49f-59f6-bc53-b11b8888c685", "code": "UKDKZ7", "id": 78625, "logo": null, "date": "2025-12-11T16:30:00+00:00", "start": "16:30", "duration": "00:30", "room": "Data Engineering & Infrastructure", "slug": "pydataglobal2025-78625-bodo-dataframes-a-fast-and-scalable-hpc-based-drop-in-replacement-for-pandas", "url": "https://cfp.pydata.org/pydataglobal2025/talk/UKDKZ7/", "title": "Bodo DataFrames: a fast and scalable HPC-based drop-in replacement for Pandas", "subtitle": "", "track": "Data Engineering & Infrastructure", "type": "Talk", "language": "en", "abstract": "Pandas is a popular library for data scientists but it struggles with large datasets; programs either become too slow or run out of memory. In this talk, we introduce Bodo DataFrames (https://github.com/bodo-ai/Bodo) as a drop-in replacement for the Pandas library that uses high performance computing (HPC) based techniques such as Message Passing Interface (MPI) and JIT compilation for acceleration and scaling. 
We give an overview of its architecture and explain how it avoids the problems of Pandas (while keeping user code the same), go over concrete examples, and finally discuss current limitations. This talk is for Pandas users who would like to run their code on larger data while avoiding frustrating code rewrites to other APIs. Basic knowledge of Pandas and Python is recommended.", "description": "Despite its popularity for data manipulation tasks, Pandas struggles at scale due to its single threaded execution and significant Python-based overheads. In this talk, we introduce Bodo DataFrames as a solution to scaling Pandas with a single line of code change; simply replace `import pandas as pd` with `import bodo.pandas as pd`.  \r\n\r\nBodo DataFrames transforms Pandas code into lazily evaluated plans, enabling database-quality query optimizations, and runs on a streaming, parallel backend using the Message Passing Interface (MPI) for fast worker-to-worker communication. This design avoids out-of-memory errors and is easily scalable from laptop to large cloud cluster. Unlike other data processing engines, Bodo DataFrames combine powerful techniques from high performance computing (HPC) and databases while remaining fully Pandas compatible.\r\n\r\nWe will present multiple examples and benchmarks demonstrating how to use Bodo DataFrames. The first example will show how to scale a simple program covering functions like reading/writing Parquet files, Series-datetime, merge, and groupby-agg. The next example will demonstrate how to accelerate user defined functions (i.e. map and apply) using Bodo DataFrames builtin support for Just-In-Time (JIT) compilation. The final example will demonstrate how to use Bodo DataFrames support for the Apache Iceberg format, which provides schema evolution and time travel for ever-changing datasets. 
We will also discuss how Bodo DataFrames falls back to Pandas when it doesn't support all operations of a workload, and planned future work.\r\n\r\nThis talk is designed for users of Pandas; data scientists, data engineers and AI/ML practitioners, who are interested in accelerating and scaling their workloads easily. In addition to a new tool under their belt, attendees will walk away with an understanding of techniques from HPC and databases, unlocking deeper insights into aspects of performance and memory utilization.", "recording_license": "", "do_not_record": false, "persons": [{"code": "7JA8SG", "name": "Scott Routledge", "avatar": "https://cfp.pydata.org/media/avatars/7JA8SG_gu9MYL6.webp", "biography": "Scott is a Software Engineer at Bodo.ai, where he has worked on the performance and reliability of the BodoSQL engine, contributed to the Bodo Just-In-Time Python Compiler, and is currently working on Bodo DataFrames. He earned his undergraduate degree in computer science from Carnegie Mellon University.", "public_name": "Scott Routledge", "guid": "0433c1ce-3c80-5687-8f3f-b198269c0970", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/7JA8SG/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/UKDKZ7/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/UKDKZ7/", "attachments": []}], "Impact Scholarship Program": [{"guid": "9f03f9c5-ebd0-5249-bec3-2cd80ac5db6f", "code": "SGNMQM", "id": 86183, "logo": null, "date": "2025-12-11T12:00:00+00:00", "start": "12:00", "duration": "01:00", "room": "Impact Scholarship Program", "slug": "pydataglobal2025-86183-how-do-we-create-access-for-those-who-don-t-show-up-in-our-spaces", "url": "https://cfp.pydata.org/pydataglobal2025/talk/SGNMQM/", "title": "How Do We Create Access for Those Who Don\u2019t Show Up in Our Spaces?", "subtitle": "", "track": "Impact Scholarship Program", "type": "Talk", "language": "en", "abstract": "Impact Scholars Program", "description": "Impact 
Scholars Program", "recording_license": "", "do_not_record": false, "persons": [{"code": "KZQEGF", "name": "Anita Ihuman", "avatar": "https://cfp.pydata.org/media/avatars/KZQEGF_VcrBnjB.webp", "biography": null, "public_name": "Anita Ihuman", "guid": "ffa84524-73ba-5cbe-bf9f-923c201933f7", "url": "https://cfp.pydata.org/pydataglobal2025/speaker/KZQEGF/"}], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/SGNMQM/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/SGNMQM/", "attachments": []}, {"guid": "b089adff-3b14-5b41-8cee-39437d0fc82b", "code": "PXLTKU", "id": 86184, "logo": null, "date": "2025-12-11T17:00:00+00:00", "start": "17:00", "duration": "01:00", "room": "Impact Scholarship Program", "slug": "pydataglobal2025-86184-bof-networking-session", "url": "https://cfp.pydata.org/pydataglobal2025/talk/PXLTKU/", "title": "BoF - networking session", "subtitle": "", "track": "Impact Scholarship Program", "type": "Talk", "language": "en", "abstract": "During the BoF session, you\u2019ll have the opportunity to submit discussion topics at the start of the meeting via the meeting chat. Discussions will take place in smaller, breakout rooms, where you are free to join or switch groups at any time. While we encourage topics related to Data Science, AI, career development, research, and Open Source, the group can explore other topics as long as they align with the Code of Conduct. Let\u2019s collaborate, share ideas, and inspire each other in a dynamic, interactive environment!", "description": "Impact Scholars Program", "recording_license": "", "do_not_record": false, "persons": [], "links": [], "feedback_url": "https://cfp.pydata.org/pydataglobal2025/talk/PXLTKU/feedback/", "origin_url": "https://cfp.pydata.org/pydataglobal2025/talk/PXLTKU/", "attachments": []}]}}]}}}