{"$schema": "https://c3voc.de/schedule/schema.json", "generator": {"name": "pretalx", "version": "2026.1.0.dev0"}, "schedule": {"url": "https://cfp.pydata.org/virginia2025/schedule/", "version": "0.40", "base_url": "https://cfp.pydata.org", "conference": {"acronym": "virginia2025", "title": "PyData Virginia 2025", "start": "2025-04-18", "end": "2025-04-19", "daysCount": 2, "timeslot_duration": "00:05", "time_zone_name": "US/Eastern", "colors": {"primary": "#4C9CB4"}, "rooms": [{"name": "Auditorium 5", "slug": "4548-auditorium-5", "guid": "877b7c2e-8a54-5f91-a994-f14717c5c3be", "description": null, "capacity": null}, {"name": "Auditorium 4", "slug": "4549-auditorium-4", "guid": "a06ff8ae-8e5b-5105-83c2-7600e2120b6c", "description": null, "capacity": null}, {"name": "Auditorium 3", "slug": "4550-auditorium-3", "guid": "45d6d0a8-f4ad-507a-aa78-040a499ffab5", "description": null, "capacity": null}, {"name": "Room 120", "slug": "4551-room-120", "guid": "5037d7fa-4529-5052-8aef-8b35200900e0", "description": null, "capacity": null}, {"name": "Room 130", "slug": "4552-room-130", "guid": "b0c76a60-b6a7-590f-8dfa-5cd4772a399c", "description": null, "capacity": null}, {"name": "Room 140", "slug": "4553-room-140", "guid": "79090300-67a8-5f5d-9aa9-14d91d353fad", "description": null, "capacity": null}], "tracks": [], "days": [{"index": 1, "date": "2025-04-18", "day_start": "2025-04-18T04:00:00-04:00", "day_end": "2025-04-19T03:59:00-04:00", "rooms": {"Auditorium 5": [{"guid": "52b6ed57-61f4-59de-a1f0-b30957c8504e", "code": "DZTLEW", "id": 77456, "logo": null, "date": "2025-04-18T09:15:00-04:00", "start": "09:15", "duration": "00:45", "room": "Auditorium 5", "slug": "virginia2025-77456-keynote-building-ai-first-organizations", "url": "https://cfp.pydata.org/virginia2025/talk/DZTLEW/", "title": "Keynote: Building AI-First Organizations", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "As businesses strive to become AI-first, the pivotal role of AI 
practitioners extends beyond technical implementation to encompass strategic stewardship. This transition necessitates a profound understanding of organizational goals, data governance, and ethical considerations. By aligning AI initiatives with business objectives, fostering cross-functional collaboration, and addressing challenges such as data privacy and employee adaptation, AI professionals can drive effective transformation. This keynote explores the essential competencies and approaches required for AI practitioners to lead their organizations successfully into an AI-centric future.", "description": "In the quest to become AI-first, organizations face the imperative of aligning technological innovation with strategic business objectives. This transformation requires AI practitioners to evolve into strategic stewards who not only possess technical expertise but also deeply understand organizational goals and the multifaceted challenges of AI implementation. Key considerations include:\r\n\r\n- **Strategic Alignment:** AI initiatives must be closely integrated with the organization's overarching goals. This entails identifying areas where AI can drive significant value, such as enhancing operational efficiency, improving customer experiences, or enabling data-driven decision-making. A clear strategic vision ensures that AI projects are purpose-driven and aligned with business priorities. \r\n- **Data Management:** Treating data as a strategic asset is fundamental. This involves going beyond establishing robust data governance frameworks that ensure data quality, privacy, and security. Strategic data management practices enable leaders realize the monetary value of the organization\u2019s data, build reliable AI models and foster trust among stakeholders.\r\n- **Targeted AI Investment:** Organizations should focus AI development in domains where human capabilities are limited, allowing AI to complement human strengths. 
Conversely, in areas where humans excel and AI falls short\u2014such as tasks requiring deep creativity, empathy, or complex judgment\u2014investment should prioritize human expertise. This strategic allocation ensures that AI serves as an effective tool without encroaching upon domains where human skills are paramount. \r\n- **Human-AI Interaction Design:** Insights from research on human-machine interaction are vital for designing AI systems that are intuitive and user-friendly. Emphasizing the human-in-the-loop approach ensures that AI tools augment human capabilities, leading to more effective and ethical AI implementations. \r\n- **Ethical Considerations:** Addressing ethical challenges such as data privacy, bias, and regulatory compliance is crucial. Implementing AI responsibly involves proactive measures to mitigate risks and uphold ethical standards, thereby maintaining public trust and safeguarding the organization's reputation. \r\n- **Change Management:** Transitioning to an AI-first organization necessitates effective change management strategies. This includes reskilling and upskilling employees, managing cultural shifts, and addressing potential resistance to change. Empowering employees to work alongside AI technologies fosters a culture of innovation and continuous improvement.\r\n\r\nThis keynote delves into these critical aspects, offering insights into how AI practitioners can become effective stewards of AI strategy. 
By embracing a holistic approach that encompasses strategic alignment, robust data practices, ethical considerations, and proactive change management, organizations can successfully navigate the complexities of AI adoption and thrive in an AI-centric future.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DYPGYB", "name": "Rajkumar Venkatesan", "avatar": "https://cfp.pydata.org/media/avatars/DYPGYB_0MeBDwE.webp", "biography": "**Rajkumar Venkatesan**\u00a0is the Ronald Trzcinski and John Tyler Professor of Business Administration, and the Co-Academic Director of the LaCross Institute for Ethical AI in Business at the Darden Business School at the University of Virginia. Raj has written about and taught Quantitative Digital Marketing to MBA and executive education students worldwide. His teaching experience and research at Darden translated into the books,\u00a0Cutting Edge Marketing Analytics,\u00a0published by Pearson Education in 2014 and AI Marketing Canvas in 2021. He has published extensively in the Journal of Marketing, Journal of Marketing Research, Marketing Science, Journal of Academy of Marketing Science, International Journal of Research in Marketing, Harvard Business Review, and California Management Review. He serves as an Associate Editor for the Journal of Academy of Marketing Science. He is a recipient of several awards including the Long-Term Impact in B2B Marketing from ISBM, and the Well Fargo Award for course materials development. More than 450,000 individuals have participated in his courses on Coursera. Venkatesan has consulted with large enterprises and startups in the technology, retailing, media, industrial goods, financial services, and life sciences industries. He has developed custom executive education programs and data analytics software for Capital One, CFA Institute, Dr. 
Reddy Labs, DFW Airports, Explore Learning, ExxonMobil, General Electric, General Dynamics, HBO, IBM, Johnson & Johnson, MAS Holdings, Navy Federal Credit Union, Pitney Bowes, Rosetta Stone, SAP, Teradata, State Farm, Tata Sons, and TEG Analytics.", "public_name": "Rajkumar Venkatesan", "guid": "118fd590-b525-5d6e-bbc8-900e18845af9", "url": "https://cfp.pydata.org/virginia2025/speaker/DYPGYB/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/DZTLEW/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/DZTLEW/", "attachments": []}, {"guid": "18417d5c-039d-58b4-aa88-59cc504b814e", "code": "3YQQ8N", "id": 77258, "logo": null, "date": "2025-04-18T10:20:00-04:00", "start": "10:20", "duration": "00:35", "room": "Auditorium 5", "slug": "virginia2025-77258-making-the-most-of-test-time-compute-in-llms", "url": "https://cfp.pydata.org/virginia2025/talk/3YQQ8N/", "title": "Making the most of test-time compute in LLMs", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Reasoning models like OpenAI's o3 and DeepSeek's R1 herald a new paradigm that leverages test-time compute to solve tasks requiring reasoning. These models represent a departure from traditional LLMs, upending long-held assumptions about them. In this session, we will discuss the different dimensions along which test-time compute can be expended and scaled. We will showcase best practices for prompting reasoning models as well as how to direct test-time compute towards achieving desired results. Finally, we will demonstrate how to train our own reasoning models specific to our domain or use case.", "description": "The objectives of this session are to:\r\n1. Highlight differences between mainstream LLMs and reasoning models\r\n2. Understand test-time compute and the different dimensions along which they can be scaled.\r\n2. Demonstrate experimental results with reasoning models from DeepSeek and OpenAI\r\n3. 
Learn how to prompt reasoning models effectively.\r\n4.  Showcase how to leverage test time compute at the application level to achieve good results.", "recording_license": "", "do_not_record": false, "persons": [{"code": "EDBNWF", "name": "Suhas Pai", "avatar": "https://cfp.pydata.org/media/avatars/EDBNWF_m3PGbrN.webp", "biography": "Suhas Pai is a NLP researcher and co-founder/CTO at Hudson Labs, a Toronto based Y-combinator backed startup. He is the author of the book 'Designing Large Language Model Applications', published by O'Reilly Media. He has contributed to the development of several open-source LLMs, including being the co-lead of the Privacy working group at BigScience, as part of the BLOOM LLM project. Suhas is active in the ML community, being Chair of the TMLS (Toronto Machine Learning Summit) conference since 2021. He is also a frequent speaker at AI conferences worldwide, and hosts regular seminars discussing the latest research in the field of NLP.", "public_name": "Suhas Pai", "guid": "3f65abdd-e84d-5b42-ae41-eaef6b04b8d8", "url": "https://cfp.pydata.org/virginia2025/speaker/EDBNWF/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/3YQQ8N/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/3YQQ8N/", "attachments": []}, {"guid": "813b3fc4-e2f0-5af4-be6f-5f8564d07224", "code": "JNHA9R", "id": 77060, "logo": null, "date": "2025-04-18T10:55:00-04:00", "start": "10:55", "duration": "00:35", "room": "Auditorium 5", "slug": "virginia2025-77060-evaluating-llms-at-s-p-global-building-a-robust-evaluation-framework-for-genai-productivity-tools", "url": "https://cfp.pydata.org/virginia2025/talk/JNHA9R/", "title": "Evaluating LLMs at S&P Global: Building a Robust Evaluation Framework for GenAI Productivity Tools", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Discover how S&P Global built an enterprise-grade evaluation framework that transformed our GenAI deployment process. 
Through automated monitoring, expert validation, & continuous testing, we\u2019ve streamlined the document integration step of our RAG tools, while ensuring our AI tools maintain consistent quality and reliability.", "description": "In this talk, we will provide an in-depth look at how S&P Global built a comprehensive and reliable evaluation framework for our Generative AI (GenAI)-powered internal productivity tools, with a focus on our Market Intelligence (MI) Sales Assistant application.\r\n\r\nWe will begin by discussing the unique challenges of evaluating large language models (LLMs) and the importance of a robust evaluation strategy, especially for Retrieval Augmented Generation (RAG)-based systems. We\u2019ll then dive into the key components of our framework:\r\n\r\n\u2022 Metrics: We thoughtfully combine traditional statistical metrics like accuracy, precision, and latency with LLM-specific metrics such as answer relevance, faithfulness to source, and hallucination detection. We\u2019ll explain each metric and its role in assessing model performance and talk about how custom metrics are often necessary in LLM applications.\r\n\r\n\u2022 Question-Answer Pair Generation: We\u2019ll share our process for generating diverse and representative question-answer pairs, including the models used, quality control measures, and lessons learned around promoting diversity in evaluation data.\r\n\r\n\u2022 Ground Truth Creation: Our framework heavily involves subject matter experts (SMEs) to create and validate ground truth data. We\u2019ll detail our process for engaging SMEs , documenting and versioning ground truth, and maintaining high standards.\r\n\r\n\u2022 Evaluation Implementation: We\u2019ll provide a technical overview of our framework, built using the MLflow library. We\u2019ll cover our daily sampling process for continuous monitoring, our comprehensive testing triggered by new releases and document updates, and cost considerations. 
We will also talk broadly on other tools available outside of MLFlow.\r\n\r\nThroughout the talk, we\u2019ll share real-world results and concrete lessons learned, such as effective strategies for question generation, SME engagement, and scaling evaluation processes. We\u2019ll demonstrate our MI Sales Assistant and evaluation dashboard to illustrate the framework in action.\r\n\r\nAttendees will come away with a clear understanding of what it takes to implement a robust evaluation framework for a real-world GenAI application. They\u2019ll learn proven best practices and potential pitfalls, equipping them to ensure their own AI systems consistently deliver value.", "recording_license": "", "do_not_record": false, "persons": [{"code": "TFU3DE", "name": "MacKenzye Leroy", "avatar": null, "biography": "MacKenzye Leroy is a Lead Data Scientist within S&P Global's newly formed MI Enterprise Technology & Internal Productivity Team, where he focuses on developing enterprise AI solutions to transform business operations. Working closely with stakeholders across Sales, Commercial, Legal, and Marketing, he implements AI-powered productivity solutions. \r\n\r\nMacKenzye combines his M.S. in Data Science from the University of Virginia with his physics background to solve complex business challenges. 
His expertise spans artificial intelligence, machine learning, data pipeline development, anomaly detection, statistical analysis, and full-stack data science implementation - from initial concept through production deployment.\r\n\r\nWhen not working with data, MacKenzye can be found exploring mountain trails by foot, bike, or snowboard, reading, or cheering on his beloved New York Mets.", "public_name": "MacKenzye Leroy", "guid": "f17ef6cc-fc17-520c-a183-80bbb59e9fc0", "url": "https://cfp.pydata.org/virginia2025/speaker/TFU3DE/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/JNHA9R/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/JNHA9R/", "attachments": []}, {"guid": "8bb982ce-ef2a-5c5e-ba3c-ec15cf25e88e", "code": "FHY93D", "id": 77386, "logo": null, "date": "2025-04-18T11:30:00-04:00", "start": "11:30", "duration": "00:35", "room": "Auditorium 5", "slug": "virginia2025-77386-maximizing-multimodal-exploring-the-search-frontier-of-text-to-image-models-to-improve-visual-find-ability-for-creatives", "url": "https://cfp.pydata.org/virginia2025/talk/FHY93D/", "title": "Maximizing Multimodal: Exploring the search frontier of text-to-image models to improve visual find-ability for creatives", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Text-to-Image models, like CLIP, have brought us into a new frontier of visual search. Whether it's searching by circling a section of a photo or powering image generators like Dalle-E the gap between pixels and tokens has never been smaller. This talk discusses how we are improving search and empowering designers with these models at Eezy, a stock art marketplace.", "description": "Objective:\r\nDescribe where and how we have improved the search experience in our product with open source multi-modal models and libraries. Real world examples from the things we have shipped (and decided not to ship) to production.\r\n\r\nOutline:\r\n1. 
Cover the architecture of open source hybrid search stack at Eezy (Elasticsearch, FAISS, PyTorch)\r\n2. Demo the capabilities and limitations of openCLIP for retrieval embeddings\r\n3. Highlight meaningful stops on our product roadmap from the last 2 years of deploying features into production.\r\n4. Describe notable missteps and surprises uncovered along the way, so people see it's not all roses in the AI powered future.\r\n5. Demo of BORGES, a novel search framework that allows users to search with multiple queries for a nuanced navigation of the catalog to find exactly what they need\r\n\r\nAudience:\r\n- Anyone curious about real-world results we have extracted from AI\r\n- Search practitioners developing hybrid search applications\r\n- PyTorch and transformers enthusiasts interested in applications in vector space\r\n- This talk is not overtly technical and does not require a background in ML/search/AI. The most math required is some multiplication and division, you got it, jump in.", "recording_license": "", "do_not_record": false, "persons": [{"code": "MK7TKU", "name": "Nathan Day", "avatar": "https://cfp.pydata.org/media/avatars/MK7TKU_NVc0uWt.webp", "biography": "I dance in vector space.", "public_name": "Nathan Day", "guid": "2cc6fc85-eecd-5603-a7af-e91573fcf8dc", "url": "https://cfp.pydata.org/virginia2025/speaker/MK7TKU/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/FHY93D/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/FHY93D/", "attachments": []}, {"guid": "4eb1b35f-738d-5e39-b31c-6878661537ff", "code": "XEBBH7", "id": 77075, "logo": null, "date": "2025-04-18T12:05:00-04:00", "start": "12:05", "duration": "00:30", "room": "Auditorium 5", "slug": "virginia2025-77075-fine-tuning-embeddings-for-semantic-caching", "url": "https://cfp.pydata.org/virginia2025/talk/XEBBH7/", "title": "Fine tuning embeddings for semantic caching", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": 
"Large Language Models (LLMs) have opened new frontiers in natural language processing but often come with high inference costs and slow response times in production. In this talk, we\u2019ll show how semantic caching using vector embeddings\u2014particularly for frequently asked questions\u2014can mitigate these issues in a RAG architecture. We\u2019ll also discuss how we used contrastive fine-tuning methods to boost embedding model performance to accurately identify duplicate questions. Attendees will leave with strategies for reducing infrastructure costs, improving RAG latency, and strengthening the reliability of their LLM-based applications. *Basic familiarity with NLP or foundation models is helpful but not required.*", "description": "# Who Should Attend?\r\nThis talk is designed for AI engineers and researchers interested in building with LLMs in production. Attendees with a basic understanding of NLP and RAG systems will benefit most, but the concepts and demonstrations will be approachable for a general technical audience.\r\n\r\n# Why It\u2019s Interesting?\r\nAs organizations incorporate LLMs into real-world products, they grapple with inference compute demands and sluggish response times. Semantic caching offers a pragmatic solution: once you identify frequently asked questions (or reoccurring queries), you can serve results from a cache rather than running a fresh, computationally expensive inference every time. This lowers cost and latency. 
Moreover, using various fine-tuning methods on the retrieval models improves the accuracy of \u201cquestion deduplication,\u201d ensuring cache hits are matched reliably.\r\n\r\n# Key Takeaways\r\n- Semantic Caching Fundamentals: How to design and implement a caching layer tailored for question-answering or conversational systems (RAG).\r\n- Embedding Fine-Tuning: An overview of contrastive methods to improve embedding models\u2019 ability to detect near-duplicate or semantically similar queries.\r\n- Practical Insights: Best practices for integrating semantic caching in production, along with tips for monitoring performance and keeping infrastructure costs down.\r\n- Real world examples.\r\n\r\n# Background Knowledge\r\n- Minimal NLP/ML Knowledge: Familiarity with embeddings, vector similarity, and basic model inference is helpful.\r\n- Basic Software Engineering: Familiarity with productionizing ML workflows will help contextualize the caching strategy.\r\n\r\n# Talk Outline (30 minutes)\r\n1. Introduction to LLM challenges in production (high inference cost, slow responses) with real world examples.\r\n2. Overview of semantic caching: concepts, benefits, and common pitfalls.\r\n3. Improving cache hit rates with contrastive fine-tuning: what it is and how it enhances embedding models.\r\n4. Demo of improving duplicate question detection.\r\n5. Recap and system architecture review.\r\n6. Share resources for further learning (GitHub links, additional reading, etc.)\r\n\r\nBy the end of this session, attendees will have a clear roadmap for employing semantic caching and contrastive fine-tuning to reduce costs and improve performance in LLM-powered applications. 
We look forward to sharing our experiences and answering your questions!", "recording_license": "", "do_not_record": false, "persons": [{"code": "GM398V", "name": "Tyler Hutcherson", "avatar": null, "biography": "Tyler leads the Applied AI Engineering group at Redis, working hands-on with customers and partners on real-time GenAI and ML workloads. Previously, Tyler led ML Engineering at a early-stage eCommerce startups building novel search & recommendation systems graduated from the University of Virginia with a BS in Physics and MS in Data Science. His passions involve MLOPs system design and working with LLMs to solve actual problems. He also enjoys distilling myths and building bridges in the tech community through knowledge and resource sharing.\r\n\r\nTyler and his wife Cynthia reside in Richmond, VA where they enjoy hosting friends, family, and soaking in the city's history, landmarks, nature, food and creative scene.", "public_name": "Tyler Hutcherson", "guid": "ed12f6f1-01db-5e66-8517-8ca6b1c6853b", "url": "https://cfp.pydata.org/virginia2025/speaker/GM398V/"}, {"code": "XYFQML", "name": "Srijith Rajamohan", "avatar": null, "biography": "Dr. Srijith Rajamohan currently leads AI Research at Redis for building efficient and scalable retrieval systems with GenAI. Prior to this role, he has led the data science effort for Sage Copilot and also led the team that created and deployed domain-specific LLMs to address the deficiencies of off-the-shelf models for accounting. He also had stints at Databricks where he led the data science developer advocacy efforts and at Nerdwallet as a data scientist. 
Before making the switch to the tech sector, he spent about six years in academia as a computational scientist at Virginia Tech.", "public_name": "Srijith Rajamohan", "guid": "77c9a411-0791-578b-afd7-d953bab9e74d", "url": "https://cfp.pydata.org/virginia2025/speaker/XYFQML/"}, {"code": "T8RYM3", "name": "Waris Gill", "avatar": null, "biography": "I am a final-year PhD student in the Computer Science department at Virginia Tech. Currently, I am interning at Redis as a Machine Learning Engineer.", "public_name": "Waris Gill", "guid": "ce312eed-037b-51db-9c4a-63522994c60c", "url": "https://cfp.pydata.org/virginia2025/speaker/T8RYM3/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/XEBBH7/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/XEBBH7/", "attachments": []}, {"guid": "87a9e581-6319-599d-be45-a5fa0db6d639", "code": "NEKHFV", "id": 77520, "logo": null, "date": "2025-04-18T13:35:00-04:00", "start": "13:35", "duration": "01:00", "room": "Auditorium 5", "slug": "virginia2025-77520-panel-principles-for-effective-and-successful-data-scientists", "url": "https://cfp.pydata.org/virginia2025/talk/NEKHFV/", "title": "Panel: Principles for Effective and Successful Data Scientists", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "What truly makes a data scientist effective in their job and career? Come hear from our panel of data scientists, each with their unique pathway into data science, discuss the principles that matter: pathways to data science, translating business problems, and what technical expertise means for data science. 
Grow your insight into becoming the kind of data scientist people trust to solve the right problems, the right way.", "description": "This conversational panel brings together experienced data science professionals to explore what truly matters for success in the field beyond what's typically learned in educational settings.\r\n\r\nOur panelists will share insights on:\r\n* The \"real world\" skills critical to data science that aren't typically taught in academic programs\r\n* Foundations of data science: the core understanding data, the mechanics of models, and the importance of considering MLOps as a Data Scientist\r\n* How to stand out in data science job opportunities and the pathways in and through data science.\r\n* Practical advice for students, job seekers, and career changers looking to enter or advance in data science\r\n\r\nThis session will be valuable for students, early-career data scientists, those interviewing for data science roles, professionals seeking promotions, and individuals looking to transition from other fields into data science.\r\n\r\nThe panel will include time for audience Q&A, allowing attendees to ask specific questions around each major discussion point during the process.", "recording_license": "", "do_not_record": false, "persons": [{"code": "Q7AVEA", "name": "Aaron Baker", "avatar": "https://cfp.pydata.org/media/avatars/Q7AVEA_Q3DRaW8.webp", "biography": "Data Scientist and Statistician working at CapTech Consulting with 8 years experience in business. 
A coach, mentor, teacher, and expert in the field of data science and a diverse love of learning across the fields of data science, psychology, business, and foreign literature.", "public_name": "Aaron Baker", "guid": "06ec262c-74ef-5700-aa84-9c76cebcb491", "url": "https://cfp.pydata.org/virginia2025/speaker/Q7AVEA/"}, {"code": "B3XXAU", "name": "Renee Teate", "avatar": "https://cfp.pydata.org/media/avatars/B3XXAU_Rh19V2L.webp", "biography": "Renee Teate is the Senior Director of Data Science at higher ed tech company HelioCampus and author of SQL for Data Scientists (Wiley). Many people know her as the host of the Becoming a Data Scientist Podcast, or as \"Data Science Renee\" from BlueSky (previously becomingdatasci on Twitter).\r\n\r\nRenee lives in Harrisonburg, VA, and is a graduate of JMU and UVA. She has worked with data her entire career, as a database designer, data analyst, data scientist, and director. She enjoys chatting with people looking to \"break into\" data careers, or looking to build their data science network.", "public_name": "Renee Teate", "guid": "ed6a1eeb-d9d2-599c-be91-bb7af22f10ab", "url": "https://cfp.pydata.org/virginia2025/speaker/B3XXAU/"}, {"code": "MFW8FA", "name": "David Der", "avatar": "https://cfp.pydata.org/media/avatars/MFW8FA_ErIZyif.webp", "biography": "Chief AI Officer, computer scientist", "public_name": "David Der", "guid": "2f4f4725-dca3-5b72-89ee-174e9fa4904b", "url": "https://cfp.pydata.org/virginia2025/speaker/MFW8FA/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/NEKHFV/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/NEKHFV/", "attachments": []}, {"guid": "c50e1cc6-3bca-5232-817b-3bc38b10a35b", "code": "HKZH7C", "id": 77117, "logo": null, "date": "2025-04-18T14:55:00-04:00", "start": "14:55", "duration": "00:35", "room": "Auditorium 5", "slug": "virginia2025-77117-addressing-climate-change-with-ai", "url": "https://cfp.pydata.org/virginia2025/talk/HKZH7C/", "title": 
"Addressing Climate Change with AI", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "This talk will survey how AI is currently used to address climate change, and describe possible future use cases.  This high-level overview will touch on various aspects of climate change (e.g. energy, transportation, land use), of AI (e.g. image processing, reinforcement learning, LLMs), and of their intersection.  The talk will conclude with resources for learning more about this area, and suggestions for contributing to current and future efforts.", "description": "Overview: \r\nAI is profoundly shaping society.  An equally forceful phenomenon is climate change; humanity is already feeling the impacts, and temperatures and greenhouse gas emissions keep rising.  The goal of this talk is to briefly survey the many ways AI is and can be used to address climate change, and to provide pointers to anyone interested in contributing to the effort.  The intended audience is anyone with an interest in this intersection of AI and climate change.\r\n\r\nClimate Change: \r\nWe\u2019ll briefly discuss aspects of climate change which AI is tackling, such as mitigating emissions from the five most carbon-intensive sectors: energy, manufacturing, land use, transportation, and buildings / infrastructure.  We\u2019ll also look at AI\u2019s application to other areas such as climate modeling, carbon capture, climate finance, and reducing the carbon footprint of AI itself.  \r\n\r\nAI: \r\nWe\u2019ll see how a number of AI methods can be used to address climate change, including: various neural net architectures (e.g. convolutional, recurrent, graph), LLMs, reinforcement learning, generative AI, neural operators, causality, and natural language processing.\r\n\r\nTheir intersection: \r\nWe\u2019ll display a matrix of climate change domains and selected AI methods that can address them, as a guide to tractable areas to tackle.  
We\u2019ll look at unsolved climate-related areas where AI could potentially help.  We\u2019ll conclude by providing resources for anyone wishing to learn more about this intersection, and for technologists wanting to plug into an existing community to contribute to this effort.", "recording_license": "", "do_not_record": false, "persons": [{"code": "9RYVNW", "name": "Dan Loehr", "avatar": "https://cfp.pydata.org/media/avatars/9RYVNW_NiLuGnW.webp", "biography": "Dan Loehr earned his bachelor's in Computer Science from Cornell and a master's and PhD from Georgetown in Computational Linguistics. He 30 years experience leading large organizations in R&D and application of Machine Learning, AI, Natural Language & Speech Processing, and related fields. He has numerous publications and extensive experience teaching these topics at the graduate level. He's currently teaching a course on AI & Climate Change in Georgetown's Master of Science in Data Science and Analytics program", "public_name": "Dan Loehr", "guid": "18b829eb-9abb-50c8-b8f6-186b872ca9d8", "url": "https://cfp.pydata.org/virginia2025/speaker/9RYVNW/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/HKZH7C/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/HKZH7C/", "attachments": []}, {"guid": "6b34a585-a19c-5924-9a95-19863f8ef782", "code": "SF7WAK", "id": 77427, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/SF7WAK/cyberpunk-bike_FiPcEpP_WZ9eRbK.png", "date": "2025-04-18T15:30:00-04:00", "start": "15:30", "duration": "00:35", "room": "Auditorium 5", "slug": "virginia2025-77427-real-time-fitness-leaderboards-with-open-source-moose", "url": "https://cfp.pydata.org/virginia2025/talk/SF7WAK/", "title": "Real-Time Fitness Leaderboards with Open-Source Moose", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Ever wished you could power live leaderboards for fitness challenges or dynamically award wellness badges in real time? 
Traditional OLTP systems often buckle under the pressure of continuous writes and aggregate reads. In this talk, we\u2019ll explore how Moose, an open-source OLAP platform, enables rapid ingestion and lightning-fast queries on health and workout data. We\u2019ll walk through a demo of creating real-time fitness leaderboards, awarding achievement badges, and using Python-based tools for data ingestion and visualization. Attendees will learn how an OLAP approach streamlines the architecture for modern wellness and health applications.", "description": "What & Why\r\nHealth and fitness applications produce constant streams of data, from workout logs and step counts to heart-rate measurements and sleep metrics. Crafting a dynamic, user-facing experience\u2014like up-to-the-minute leaderboards or automated badge award systems\u2014requires real-time data access and frequent aggregations. Traditional OLTP databases can stall under heavy reads and writes, making it tough to maintain a snappy user experience.\r\n\r\nEnter Moose, an open-source analytics engine built around a columnar architecture. With Moose, developers and data teams can:\r\n\r\nIngest large volumes of real-time data from wearables, apps, and sensors.\r\nRun near-instantaneous aggregations to power live dashboards or personal health insights.\r\nScale analytics cost-effectively thanks to Moose\u2019s open-source foundation and Python-friendly ecosystem.\r\nPractical Use Case: Real-Time Fitness Leaderboards\r\nWe\u2019ll demonstrate how to build a workout leaderboard that updates in real time as users complete activities. 
We\u2019ll also show how to apply custom rules for awarding achievement badges, ensuring that your application can both process and surface analytics-driven insights at scale.\r\n\r\nWho Should Attend\r\nData & Analytics Engineers: Seeking solutions to handle large volumes of health/wellness data with frequent aggregations.\r\nDevelopers/Architects: Building real-time or near-real-time consumer apps that rely on fast analytics.\r\nProduct Managers & Tech Leads: Interested in creating engaging features like live dashboards and automatic badge systems within their wellness offerings.\r\nHealth & Fitness Enthusiasts: Looking to understand how data architecture can enhance user engagement and personalized metrics.\r\nA basic understanding of databases, Python data tools, and event streams (e.g., from wearable devices) is helpful but not required.", "recording_license": "", "do_not_record": false, "persons": [{"code": "MFW8FA", "name": "David Der", "avatar": "https://cfp.pydata.org/media/avatars/MFW8FA_ErIZyif.webp", "biography": "Chief AI Officer, computer scientist", "public_name": "David Der", "guid": "2f4f4725-dca3-5b72-89ee-174e9fa4904b", "url": "https://cfp.pydata.org/virginia2025/speaker/MFW8FA/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/SF7WAK/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/SF7WAK/", "attachments": []}, {"guid": "7db160f5-abf9-5dd4-855c-3ae4448339d8", "code": "D3Z7XN", "id": 77506, "logo": null, "date": "2025-04-18T16:05:00-04:00", "start": "16:05", "duration": "01:00", "room": "Auditorium 5", "slug": "virginia2025-77506-panel-bridging-the-gap-collaborative-approaches-to-data-science", "url": "https://cfp.pydata.org/virginia2025/talk/D3Z7XN/", "title": "Panel: Bridging the Gap: Collaborative Approaches to Data Science", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "During this expert panel, we'll explore the critical intersections of data science, engineering, 
and stakeholder engagement in today's organizations. This discussion will address how to break down silos between technical disciplines, establish effective collaboration models, create rapid experimentation frameworks, and successfully transition projects from exploration to production. Our panelists bring diverse perspectives on building integrated teams that balance innovation with enterprise standards while delivering real value.", "description": "This panel brings together practitioners and leaders to discuss the evolving landscape of data science collaboration and implementation. As organizations face increasing pressure to derive value from AI/ML initiatives, the traditional boundaries between disciplines are being reexamined and redefined.\r\n\r\nOur panelists will explore:\r\n\r\n- Breaking down isolation between data scientists, MLOps engineers, developers, and other stakeholders\r\n- Creating effective frameworks for rapid experimentation that balance innovation with enterprise standards\r\n- Establishing robust handoff processes for transitioning models from exploration to production\r\n- Bridging cultural divides between the explorative nature of data science and the engineering mindset of MLOps\r\n- Practical strategies for cross-functional collaboration that leverages complementary skills\r\n- Managing stakeholder expectations and improving communication with non-technical audiences\r\n\r\nThis discussion is designed for data professionals at all levels\u2014from individual contributors to team leaders and executives\u2014who are navigating the challenges of modern data science implementation. 
The panel will address both technical and organizational aspects of successful data science teams.\r\n\r\nThe session will include time for audience Q&A, allowing attendees to engage directly with panelists about their specific challenges in building collaborative data science environments.", "recording_license": "", "do_not_record": false, "persons": [{"code": "DDEMZ8", "name": "Thomas  Loeber", "avatar": null, "biography": "Thomas is a senior machine learning engineer at GoHealth, where he builds and productionizes GenAI models. Previously, he worked in consulting and at a technology startup, focusing on MLOps adoption. He originally came from the statistics and data science side, but has also worked in software and data engineering, searching for lessons from these more mature disciplines for how to create maintainable and scalable software systems. Now, Thomas is passionate about integrating these diverse insights to build robust ML systems.", "public_name": "Thomas  Loeber", "guid": "5d35e809-c324-5470-9543-b37413d358b1", "url": "https://cfp.pydata.org/virginia2025/speaker/DDEMZ8/"}, {"code": "TFUHUG", "name": "Manikandarajan Shanmugavel", "avatar": "https://cfp.pydata.org/media/avatars/TFUHUG_JB5O6Kk.webp", "biography": "With over a couple of decades of experience in Information Technology, I have worked on groundbreaking technologies like Cloud and Machine Learning and witnessed their impact on the business and society. I am currently working as an Associate Director in Software development at S&P Global, one of the leaders in Financial Services. I am leading a team that contributes to the AI initiatives of S&P Global. 
I also hold a Masters degree from UVA in Data Science\r\n\r\nLinkedIN: https://www.linkedin.com/in/mani-shanmugavel/\r\n\r\nMedium: https://medium.com/@manikrajan", "public_name": "Manikandarajan Shanmugavel", "guid": "f2ffc436-400a-5848-91b9-78ed5b4e5229", "url": "https://cfp.pydata.org/virginia2025/speaker/TFUHUG/"}, {"code": "B3XXAU", "name": "Renee Teate", "avatar": "https://cfp.pydata.org/media/avatars/B3XXAU_Rh19V2L.webp", "biography": "Renee Teate is the Senior Director of Data Science at higher ed tech company HelioCampus and author of SQL for Data Scientists (Wiley). Many people know her as the host of the Becoming a Data Scientist Podcast, or as \"Data Science Renee\" from BlueSky (previously becomingdatasci on Twitter).\r\n\r\nRenee lives in Harrisonburg, VA, and is a graduate of JMU and UVA. She has worked with data her entire career, as a database designer, data analyst, data scientist, and director. She enjoys chatting with people looking to \"break into\" data careers, or looking to build their data science network.", "public_name": "Renee Teate", "guid": "ed6a1eeb-d9d2-599c-be91-bb7af22f10ab", "url": "https://cfp.pydata.org/virginia2025/speaker/B3XXAU/"}, {"code": "SNT3VA", "name": "Christopher N. Eichelberger", "avatar": "https://cfp.pydata.org/media/avatars/SNT3VA_udS1yqZ.webp", "biography": "Chris has more than 30 years experience in the space, from analytics to senior management. He learned long ago that it is people skills, not his technical ability nor his fashion sense, that would help him make an impact. He remains glad that every day is a surprise.", "public_name": "Christopher N. 
Eichelberger", "guid": "4f6518b0-d2e7-5938-8f7c-9e738c56cc19", "url": "https://cfp.pydata.org/virginia2025/speaker/SNT3VA/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/D3Z7XN/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/D3Z7XN/", "attachments": []}], "Auditorium 4": [{"guid": "b5129226-a1da-535f-89e9-30d2e7d85d7e", "code": "RBYY9R", "id": 77050, "logo": null, "date": "2025-04-18T10:20:00-04:00", "start": "10:20", "duration": "00:35", "room": "Auditorium 4", "slug": "virginia2025-77050-practical-applications-of-apache-arrow", "url": "https://cfp.pydata.org/virginia2025/talk/RBYY9R/", "title": "Practical Applications of Apache Arrow", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Data system interoperability remains a significant challenge in open source ecosystems, with high costs in development time and resources when moving data across complex infrastructures. The Apache Arrow project offers a standardized solution to reduce these integration challenges.\r\n\r\nWill Ayd (Apache Arrow Committer and pandas maintainer) and Matt Topol (Apache Arrow PMC Member and author of \"In Memory Analytics with Apache Arrow\") will discuss how Apache Arrow is changing the data landscape. A brief overview of Arrow standards will be provided, while also reviewing real world implementations of where the Arrow specification has driven down the cost of data interoperability.", "description": "The Apache Arrow project has been drastically improving the way analytical tools perform, interoperate, and scale. However, as Arrow is primarily used by developers, much of those improvements are happening \"behind the scenes,\" leaving many uninformed as to what exactly Apache Arrow is.\r\n\r\nIn this talk, we will provide a more formal definition of Apache Arrow, and discuss its various components that collectively are helping to revolutionize the data landscape. 
We will also take some time to explore how popular Python packages like pandas, polars, and pantab have been leveraging Apache Arrow for interoperability between utilities, while also having an open discussion as to what can still be done.\r\n\r\nBy the end of this talk, users will have an appreciation of how Apache Arrow is powering their Python (and non-Python!) libraries today, and how it will shape the data landscape going forward. Topics like Arrow Flight, Arrow Flight SQL, Arrow ADBC, and nanoarrow will be discussed, and attendees will gain a deeper understanding of how these technologies are evolving the way data is used in embedded environments, relational databases, HTTP exchanges, AI applications, and more.", "recording_license": "", "do_not_record": false, "persons": [{"code": "YYSMTU", "name": "William Ayd", "avatar": "https://cfp.pydata.org/media/avatars/YYSMTU_ZDbJlqh.webp", "biography": "Will Ayd is the author of the Pandas Cookbook, Third Edition, and has served as a maintainer of the pandas project since 2018. Will is also a Committer to the Apache Arrow project, and has helped improve countless more open source data libraries.\r\n\r\nIn his day job, Will helps clients in the Retail and Apparel spaces optimize cloud data platforms in AWS and GCP, while also providing strategy and training around the use of open source technology in enterprise settings.", "public_name": "William Ayd", "guid": "5cb82441-867f-56be-9672-d716e1da7ce7", "url": "https://cfp.pydata.org/virginia2025/speaker/YYSMTU/"}, {"code": "ESKHPZ", "name": "Matthew Topol", "avatar": "https://cfp.pydata.org/media/avatars/ESKHPZ_7yQEC7q.webp", "biography": "Hailing from the faraway land of Brentwood, NY and currently residing in the rolling hills of Connecticut, Matt Topol has always been passionate about software. After graduating from Brooklyn Polytechnic (now NYU-Poly), he joined FactSet Research Systems, Inc. in 2009 developing financial software. 
In the time since, Matt has worked in infrastructure and application development, has led development teams, and architected large-scale distributed systems for processing analytics on financial data. Matt is a PMC member for the Apache Arrow project, frequently enhancing the Golang library among other enhancements and helping to grow the Arrow Community. Recently, Matt wrote the first and only book on Apache Arrow \"In-Memory Analytics with Apache Arrow\" and joined Voltron Data in order to work on the Apache Arrow libraries full time and grow the Arrow Golang community.\r\n\r\nIn his spare time, Matt likes to bash his head against a keyboard, develop/run delightfully demented games of fantasy for his victims--er--friends, and share his knowledge with anyone interested who'll listen to his rants.", "public_name": "Matthew Topol", "guid": "1b97e2ad-76ee-53ac-bf3b-f8e294851f66", "url": "https://cfp.pydata.org/virginia2025/speaker/ESKHPZ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/RBYY9R/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/RBYY9R/", "attachments": []}, {"guid": "18bc421a-9ad8-5e7d-bc2f-ea7f1c51c6a4", "code": "8GSQPK", "id": 77070, "logo": null, "date": "2025-04-18T10:55:00-04:00", "start": "10:55", "duration": "00:35", "room": "Auditorium 4", "slug": "virginia2025-77070-data-wrangling-with-duckdb", "url": "https://cfp.pydata.org/virginia2025/talk/8GSQPK/", "title": "Data wrangling with DuckDB", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Learn how to wrangle data in Python with DuckDB, a fast, open source, in-process analytical SQL database!", "description": "Learn how to use DuckDB to process data in python! In the era of \"big data,\" many data practitioners immediately reach for distributed computing solutions when facing large datasets. Modern hardware capabilities combined with efficient tools like DuckDB make this much less necessary than a few years ago. 
This talk will demonstrate how to effectively wrangle data using DuckDB in Python, offering a powerful alternative to Pandas and Spark for the majority of data science workflows.\r\n\r\nThis session will cover:\r\n\r\n- Understanding DuckDB's architecture and its integration with the Python ecosystem\r\n- Practical examples of migrating from pandas to DuckDB.\r\n- Performance benchmarks comparing DuckDB against pandas and other popular Python data processing methods\r\n- Real-world scenarios where DuckDB shines, including handling larger-than-memory datasets\r\n- Discussion of the \"shrinking size\" of big data and when to consider DuckDB versus distributed computing solutions\r\n\r\nThis talk is aimed at Python data practitioners who regularly work with medium to large datasets (100MB-100GB) and are looking to optimize their data processing workflows. The presentation will include both conceptual explanations and hands-on code examples.", "recording_license": "", "do_not_record": false, "persons": [{"code": "R7MHQV", "name": "Will Angel", "avatar": "https://cfp.pydata.org/media/avatars/R7MHQV_brVeBli.webp", "biography": "Will Angel is a Data Solution Architect at Excella, leading data teams to help our clients solve data problems. Will is the author of Virtual Power: The Future of Energy Flexibility, an organizer for the Data Visualization and Data Engineers DC Meetups, and the executive director at Data Community DC, a 501c3 nonprofit dedicated to data education in the national capital area. 
In his free time, Will enjoys wildlife photography, gardening, reading, cooking, art, DIY electronics, and traveling.", "public_name": "Will Angel", "guid": "a8a8bda1-6706-5528-a581-1f3a7ca36fee", "url": "https://cfp.pydata.org/virginia2025/speaker/R7MHQV/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/8GSQPK/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/8GSQPK/", "attachments": []}, {"guid": "8fa4404b-5508-5a40-9e9c-49ce8ee1762f", "code": "UDQZBM", "id": 77522, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/UDQZBM/Screenshot_2025-04-19_a_Fxttayq.png", "date": "2025-04-18T11:30:00-04:00", "start": "11:30", "duration": "00:35", "room": "Auditorium 4", "slug": "virginia2025-77522-zero-code-change-gpu-powered-graph-analytics-with-networkx-and-cugraph", "url": "https://cfp.pydata.org/virginia2025/talk/UDQZBM/", "title": "Zero Code Change GPU-Powered Graph Analytics with NetworkX and cuGraph", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Graphs are a fundamental form of storing data. This is because **everything is connected!** Hence, Graphs are very useful for modeling and solving a wide variety of real-world problems.\r\n \r\nWhile NetworkX is amazing for getting started with Graphs, the library encounters bottlenecks in performance at scale.\r\n\r\nIs there a solution out there for users who want more performance from NX and also Open-Source developers who want to implement fast algorithms? Yes! 
Thanks to the magic of dispatching.\r\n\r\nNetworkX now supports dispatching to various backends, including the GPU accelerated cuGraph library by Nvidia RAPIDS.\r\n\r\nAttend this talk to learn about how you can use nx-cugraph \u2013 the cuGraph-powered backend for NetworkX \u2013 and how it unlocks exciting new possibilities for you to solve real-world graph analytics problems.", "description": "This talk will showcase a GPU accelerated graph backend presented by NVIDIA in partnership with the NetworkX Community. It aims to showcase how GPUs are well-suited to solving graph problems at large scales.\r\n\r\nThe talk is intended for Python developers who are interested in using GPUs in their workflows and data scientists interested in Graph analytics.\r\n\r\nDuring the talk, we intend to go over the following.\r\n\r\n1. Brief introduction to Graphs and why Graph Analytics is so powerful.\r\n\r\n2. Introducing NetworkX \u2013 Why is it so popular? What are its limitations?\r\n\r\n3. Example showcasing the magic of Dispatching. The design philosophy and how it benefits both users and OS developers.\r\n\r\n4. Real-world example on the Pokec (Social Network) dataset. How to do Community Detection on a large Graph using Louvain (with Zero Code Change)!\r\n\r\n5. Finally, how we aim to work with the community to add new algorithm implementations and contribute to upstream NetworkX.\r\n \r\n6. 
Q&A!\r\n\r\n===\r\n\r\nLearn more:\r\n\r\n - [Project page](https://rapids.ai/nx-cugraph/)!\r\n - [GitHub](https://github.com/rapidsai/nx-cugraph)!\r\n\r\nI'd love to connect with you and discuss ideas of applying Graph analytics to *your* work.\r\n\r\nReach out via [LinkedIn](https://www.linkedin.com/in/ralph-liu/)", "recording_license": "", "do_not_record": false, "persons": [{"code": "WSA7HJ", "name": "Ralph Liu", "avatar": null, "biography": "Ralph is currently a software engineer at NVIDIA, working on GPU-accelerated graph libraries ([cuGraph](https://github.com/rapidsai/cugraph), [nx-cugraph](https://rapids.ai/nx-cugraph/)) as a part of [RAPIDS](https://rapids.ai/).", "public_name": "Ralph Liu", "guid": "219530b4-fe2b-5fe9-aa40-70d811cc72fc", "url": "https://cfp.pydata.org/virginia2025/speaker/WSA7HJ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/UDQZBM/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/UDQZBM/", "attachments": []}, {"guid": "86397d18-d4a9-54d2-adf4-adfbfaf9d4c0", "code": "XRXKDK", "id": 77170, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/XRXKDK/Gemini_Generated_Image_JRnoubv.jpeg", "date": "2025-04-18T12:05:00-04:00", "start": "12:05", "duration": "00:30", "room": "Auditorium 4", "slug": "virginia2025-77170-practical-multi-armed-bandits", "url": "https://cfp.pydata.org/virginia2025/talk/XRXKDK/", "title": "Practical Multi Armed Bandits", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Multi-armed bandits are a reinforcement learning tool often used in environments where the cost or rewards of different choices are unknown or where those functions may change over time. The good news is that as far as implementation goes, bandits are surprisingly easy to implement; however, in practice, the difficulty comes from defining a reward function that best targets your specific use case. 
In this talk, we will discuss how to use bandit algorithms effectively, taking note of practical strategies for experimental design and deployment of bandits in your applications.", "description": "Imagine a row of slot machines (often called one-armed bandits because of the lever on the side and the fact that they take your money) -- you know that one of them will pay out more than the others over time, but how do you figure out which one? This is the premise of the multi-armed bandit (MAB) problem, which has become a vital reinforcement learning technique used to balance the exploration-exploitation dilemma (e.g., at what point do you start exploiting the best choice to maximize your rewards instead of exploring for better options).\r\n\r\nMulti-armed bandits are straightforward to implement: define your choices and assign each of them a probability distribution for selection. Each time a choice is made, the probability distribution for that choice is updated based on the outcome of a reward function. Easy right? The trick is in designing both your choices and your reward function in such a way that you capture the dynamics of your experimental environment, often a live environment that involves user behavior and other irregularities! \r\n\r\nThings get more complicated when you have multiple agents - each of them with their own probability distributions. Here, you need to design the reward functions such that your desired behavior emerges from the collective interactions of each individual agent. The best type of complexity arises globally from many simple local interactions! \r\n\r\nIn this talk, we will learn how to implement multi-armed bandits and reward functions for three use cases: ordering a news feed, prioritizing tasks for a team in a sprint, and minimizing cloud costs for a distributed system. We'll focus on practical strategies for designing reward functions and dealing with change. 
At the end of this talk you should be ready and excited to implement bandit algorithms for your own data science problems!", "recording_license": "", "do_not_record": false, "persons": [{"code": "FBYRYV", "name": "Benjamin Bengfort", "avatar": null, "biography": "Dr. Benjamin Bengfort is the co-founder and CEO of Rotational Labs, where he orchestrates the integration of innovative machine learning techniques with advanced distributed computing systems.  A seasoned expert in systems engineering, programming, and data science, he has a proven record of developing AI-driven solutions that support globally distributed data architectures and address the complex challenges of multi-region organizations. Under his leadership, Rotational has focused on not just the implementation but also the responsibility of participating in an AI driven economy; a believer in open source, Dr. Bengfort pays special attention to the ethics and outcomes of AI, ensuring humans are at the center of our solutions. He is the co-author of Applied Text Analysis with Python (2018, O\u2019Reilly) and Data Analytics with Hadoop (2016, O\u2019Reilly). Dr. Bengfort earned his Ph.D. 
from the University of Maryland focusing on planetary scale distributed systems.", "public_name": "Benjamin Bengfort", "guid": "1c03ccb5-638a-5420-a1d3-fbadd649a09a", "url": "https://cfp.pydata.org/virginia2025/speaker/FBYRYV/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/XRXKDK/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/XRXKDK/", "attachments": []}, {"guid": "33a1881e-24b9-5167-8049-aa27ec81d382", "code": "AFZSVT", "id": 77497, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/AFZSVT/Screenshot_2025-02-13_1_jm3KaOo.png", "date": "2025-04-18T14:55:00-04:00", "start": "14:55", "duration": "00:35", "room": "Auditorium 4", "slug": "virginia2025-77497-using-python-to-unlock-insights-from-openstreetmap-data-at-scale", "url": "https://cfp.pydata.org/virginia2025/talk/AFZSVT/", "title": "Using Python to Unlock Insights from OpenStreetMap Data at Scale", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Geospatial data can unlock valuable insights. OpenStreetMap includes electric power and telecommunication infrastructure geospatial data, and it is already \u201copen\u201d. This presentation will demonstrate how to use Python to \u201cunlock the insights\u201d available in OSM power and telecommunications geospatial data.", "description": "Commercial real estate organizations are avid consumers of geospatial data. These organizations have already identified the value in particular of power and telecommunications infrastructure spatial data to make business decisions. Examples of these data include: the locations of power plants, transmission lines, fiber backbone cables, and  submarine fiber cables.\r\n\r\nOne rich source for these datasets is OpenStreetMap (OSM), however natively OSM does not  streamline access to data, especially at scale. 
Because OSM data are open, we can use Python to query, download, and transform OSM power and telecommunications spatial data for use within Open Source and commercial Geographic Information Systems (GIS) software applications, models built in Python and other languages, and really any other tools and processes which can read GIS data. \r\n\r\nThis presentation will present a high-level overview of the overall data flow, and then dive into individual steps and how each step was implemented in Python. Examples will be provided, and maps and analyses based on the resulting spatial data will be demonstrated. This presentation will also explain one approach to download very large OSM datasets, for example data spanning continents and including many different themes. Along the way this presentation will also touch on how to avoid \u201cgotchas\u201d and how this approach could be adapted to different types of OSM data supporting other use cases and business requirements.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ZSTFQ8", "name": "Cory Eicher", "avatar": "https://cfp.pydata.org/media/avatars/ZSTFQ8_OKhEL4y.webp", "biography": "Cory Eicher is the founder of Eichcorp, a software consulting and implementation practice based in Charlottesville, Virginia... 
Developer/Mapper/Reader/Biker/Hiker/Skier/Soccer-er", "public_name": "Cory Eicher", "guid": "94bcb0d9-e6c1-55ff-90d5-bcb86bb58cd6", "url": "https://cfp.pydata.org/virginia2025/speaker/ZSTFQ8/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/AFZSVT/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/AFZSVT/", "attachments": []}, {"guid": "664a9276-41cc-5125-bdef-f6acac6ebcb0", "code": "ECJWAP", "id": 77496, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/ECJWAP/icon_mEQ90e7_ki3Nw6z.png", "date": "2025-04-18T15:30:00-04:00", "start": "15:30", "duration": "00:35", "room": "Auditorium 4", "slug": "virginia2025-77496-versioning-multimodal-data-metadata-beyond", "url": "https://cfp.pydata.org/virginia2025/talk/ECJWAP/", "title": "Versioning Multimodal Data: Metadata & Beyond", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "The team behind DVC has spent years tackling data versioning challenges. With the rise of AI, we\u2019ve seen new complexities emerge - especially with multimodal datasets like images, video, audio, and text. This talk shows why multimodal data versioning is different and how Pydantic provides a powerful way to structure and integrate metadata.", "description": "The team behind DVC has spent years tackling data versioning challenges. With the rise of AI, we\u2019ve seen new complexities emerge - especially with multimodal datasets like images, video, audio, and text. Simply tracking files is no longer enough; metadata, including bounding boxes, poses, text annotations, and embeddings, is now central to dataset management, using LLM for auto-annotation is becoming a daily routine. 
This talk shows why multimodal data versioning is different, how Pydantic provides a powerful way to structure and integrate metadata and how this approach is implemented in open-source library DataChain.\r\n\r\nWe\u2019ll also cover efficient dataset operations at scale: computing diffs across millions of files, managing expensive GPU-based metadata computations like embeddings and performing incremental dataset updates. The audience will learn practical tricks for building scalable, high-performance AI workflows with modern dataset management techniques.", "recording_license": "", "do_not_record": false, "persons": [{"code": "ZBKX3G", "name": "Dmitry Petrov", "avatar": "https://cfp.pydata.org/media/avatars/ZBKX3G_hghXATA.webp", "biography": "Creator of open-source tool DVC. Ex-Data Scientist at Microsoft. PhD in Computer Science. Now co-founder of datachain.ai", "public_name": "Dmitry Petrov", "guid": "a2eee149-a95f-5a82-a073-c81c220d24e3", "url": "https://cfp.pydata.org/virginia2025/speaker/ZBKX3G/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/ECJWAP/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/ECJWAP/", "attachments": []}, {"guid": "b8821bfe-9be1-56a1-964f-4675e2630489", "code": "8M9ZJN", "id": 77137, "logo": null, "date": "2025-04-18T16:05:00-04:00", "start": "16:05", "duration": "00:35", "room": "Auditorium 4", "slug": "virginia2025-77137-ai-ready-data", "url": "https://cfp.pydata.org/virginia2025/talk/8M9ZJN/", "title": "AI Ready Data", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In today\u2019s AI first era, customers expect data products to be deeply interconnected, consumable with minimal effort and widely available. The need for \u2018AI Ready Data\u2019, suitable for consumption directly by AI agents has never been clearer.", "description": "Customers have been clear that receiving \u2018just\u2019 data is no longer sufficient. 
They expect data to be immediately accessible, usable and understandable to both human and AI consumers with \u201czero ETL\u201d (Extract Transform Load). We will discuss the direction that S&P is taking explicitly aimed at serving this need, greatly increasing the insight available to customers. This includes the concept of providing machine readable metadata at the column level for datasets. This metadata permits AI and ETL tools to automatically ingest and connect delivered data to a customer\u2019s own data as well as automatically import that data into a customer\u2019s data catalog.", "recording_license": "", "do_not_record": false, "persons": [{"code": "E9JGNP", "name": "Alec Gosse", "avatar": null, "biography": "Alec is a Senior Director of Data Science at S&P Global Market Intelligence leading AI efforts toward internal productivity.", "public_name": "Alec Gosse", "guid": "dc9977f3-6bc7-5e49-a182-294952880c2e", "url": "https://cfp.pydata.org/virginia2025/speaker/E9JGNP/"}, {"code": "3L7QJK", "name": "Hamish Brookeman", "avatar": "https://cfp.pydata.org/media/avatars/3L7QJK_4Ht0oh4.webp", "biography": "Hamish Brookeman \u2013 VP \u2013 Enterprise Data Architecture, S&P Global Enterprise Data Organization, S&P Global\r\n\r\nHamish is responsible for Enterprise Data Architecture, which is responsible for the overall design of managed data structures including strategies for data implementation, acquisition and maintenance and evaluating data sources for adherence to quality standards and ease of integration. The specific role is to capture data requirements clearly, completely and correctly, and represent them in a formal and visual way through the data models. In addition, making sure that data integration is based on a common metadata framework and that the integrated data is presented to the business as valid information.\r\n\r\nHamish previously served in a similar role for S&P Global Market Intelligence. 
Hamish joined S&P Global in 2015 via the SNL Financial acquisition where he had served as Head of Data Architecture since 2006.\r\n\r\nHamish has 25+ years of experience in technology leadership, large abstract datasets and highly engineered information systems. He has extensive knowledge of Structured, Semi-Structured and Unstructured data strategies. Hamish attended Princeton University where he studied Economics and Politics.", "public_name": "Hamish Brookeman", "guid": "1cb36498-ad3e-5232-973a-5c8c23de8b63", "url": "https://cfp.pydata.org/virginia2025/speaker/3L7QJK/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/8M9ZJN/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/8M9ZJN/", "attachments": []}, {"guid": "7c95b711-c6d9-59b9-84ca-0b7abd456cbc", "code": "FMQ8PA", "id": 77181, "logo": null, "date": "2025-04-18T16:40:00-04:00", "start": "16:40", "duration": "00:35", "room": "Auditorium 4", "slug": "virginia2025-77181-visualization-of-higher-dimensional-feature-spaces-during-model-training", "url": "https://cfp.pydata.org/virginia2025/talk/FMQ8PA/", "title": "Visualization of higher-dimensional feature spaces during model training", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Modern machine learning models typically utilize extremely high-dimensional feature spaces, which inhibits robustness and explainability. Finer-grained control over model training requires more powerful tools for observing and interacting with latent features as they evolve over time. In this talk, we give several examples of visualizations of nearest-neighbor graphs that illuminate common training pitfalls and provide practical insights for diagnosing model performance issues.", "description": "The goal of this talk is to provide machine learning practitioners with a few simple visualizations for more effective model training. 
These techniques have been developed through several years of real-world experience with model training, validation, deployment, and maintenance. Since the internal workings of large models are usually somewhat opaque, model trainers often ask themselves a familiar set of questions:  \r\n\r\nWhen should I stop training my model? \r\n\r\nWhich one of my saved model checkpoints is the \u201cbest\u201d? \r\n\r\nWhat training data should I add (or remove) to achieve a given outcome? \r\n\r\nHow do I know if my model is giving the right answer for the wrong reasons, or vice versa? \r\n\r\nHow robust is my model to out-of-distribution data? \r\n\r\nWhy is there performance drift in my deployed model? \r\n\r\nWe argue that much greater emphasis on model observability and explainability is needed, and that the right sorts of visualizations can generate valuable insights and point toward specific improvements.", "recording_license": "", "do_not_record": false, "persons": [{"code": "XENLDK", "name": "Vivek Dhand", "avatar": "https://cfp.pydata.org/media/avatars/XENLDK_Bbhw9LO.webp", "biography": "Vivek Dhand uses his background in pure mathematics to address complex real-world problems. He has led and contributed to several applied research projects involving data fusion, computer vision, and  natural language processing. He strives to develop robust and explainable systems with transparency and accountability, in order to minimize bias and protect individual privacy.\r\n\r\nVivek received his Ph.D. in mathematics from Northwestern University. 
His research interests include representation theory, category theory, algebraic combinatorics, and visualizations of mathematical structures.", "public_name": "Vivek Dhand", "guid": "52f43396-8463-5520-b2cc-d1ff66008e38", "url": "https://cfp.pydata.org/virginia2025/speaker/XENLDK/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/FMQ8PA/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/FMQ8PA/", "attachments": []}], "Auditorium 3": [{"guid": "2af63c1d-f407-5ad1-93fd-ad241c83c0bb", "code": "ZXYBV3", "id": 77457, "logo": null, "date": "2025-04-18T10:20:00-04:00", "start": "10:20", "duration": "00:35", "room": "Auditorium 3", "slug": "virginia2025-77457-bayesian-risk-analysis-for-large-multi-modal-data", "url": "https://cfp.pydata.org/virginia2025/talk/ZXYBV3/", "title": "Bayesian Risk Analysis For Large Multi-Modal Data", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In the era of big data, multi-modal data from multiple sources or modalities has become increasingly prevalent in various fields such as healthcare. The National COVID Cohort Collaborative (N3C) provides researchers with abundant clinical data in different forms by aggregating and harmonizing Electronic Health Records (EHR) data across different clinical organizations in the United States, making it convenient for researchers to analyze COVID-related topics and build models with large multimodal data. Bayesian risk analysis has advantages in handling the complexities and heterogeneities of multi-modal healthcare data, specifically in cohort studies when researchers try to answer questions of interest in public health or medicine field regarding COVID and Long COVID.", "description": "This talk is based on research projects by UVA iTHRIV on the N3C platform. Its target audience includes data scientists, undergraduate students, graduate students, researchers, and anyone interested in data science. 
The general structure of this talk will consist of a brief introduction to The National COVID Cohort Collaborative (N3C), a database with multi-modal data sets, quantitative methods and models in Bayesian risk analysis, and some real-world applications of these methods as well as some publications by our team. This talk will be informative and will include a balanced percentage of mathematical expressions and real-world applications, and the audience will learn more about quantitative methods to analyze multi-modal data in N3C.", "recording_license": "", "do_not_record": false, "persons": [{"code": "E3MSCH", "name": "Sihang Jiang", "avatar": "https://cfp.pydata.org/media/avatars/E3MSCH_pRjoYDl.webp", "biography": "Sihang Jiang is a PhD candidate at University of Virginia in systems engineering, and his research interests include Bayesian machine learning, Markov Chain Monte Carlo, AI for health, and natural language processing.", "public_name": "Sihang Jiang", "guid": "efdf8ca7-8bdd-5077-910a-5736d5c72b37", "url": "https://cfp.pydata.org/virginia2025/speaker/E3MSCH/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/ZXYBV3/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/ZXYBV3/", "attachments": []}, {"guid": "cad6b9f3-8383-5bb5-9cdc-e89b4e2438df", "code": "GLBTZD", "id": 77218, "logo": null, "date": "2025-04-18T10:55:00-04:00", "start": "10:55", "duration": "00:35", "room": "Auditorium 3", "slug": "virginia2025-77218-saving-lives-with-data-science-how-data-science-shortened-the-covid-19-pandemic-by-2-months", "url": "https://cfp.pydata.org/virginia2025/talk/GLBTZD/", "title": "Saving Lives with Data Science: How data science shortened the COVID-19 pandemic by 2 months", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "When every day counted during the COVID-19 pandemic, data science became an essential catalyst in accelerating the path to widespread vaccination. 
This talk delves into the data-driven strategies that enabled the U.S. government\u2019s vaccine trials to move faster, cutting crucial weeks\u20146 to 8, by our estimates\u2014off the timeline to deployment. Through sophisticated geospatial modeling, we identified and swiftly mobilized trial recruitment efforts in emerging hot zones, ensuring that each candidate pool was both numerically sufficient and demographically representative. Attendees will discover how advanced analytics, predictive modeling, and interdisciplinary collaboration converged to target the right communities at the right time, ultimately expediting vaccine availability. This behind-the-scenes look at rapid-response data science highlights not just the technical innovations, but the decisive cultural and operational shifts that turned real-time insights into life-saving action.", "description": "This talk explores how data science accelerated COVID-19 vaccine trials, saving 6-8 weeks in deployment. Through geospatial modeling, we targeted diverse recruitment in emerging hot zones, ensuring efficient and representative trials. Attendees will discover how advanced analytics and collaboration turned insights into life-saving action.", "recording_license": "", "do_not_record": false, "persons": [{"code": "SNLNFQ", "name": "Greg Michaelson", "avatar": "https://cfp.pydata.org/media/avatars/SNLNFQ_SdI8sjK.webp", "biography": "Greg Michaelson is Cofounder and Chief Product Officer at Zerve, a young, stealthy startup that\u2019s rethinking the data science development experience. Previously, Greg was an early joiner at DataRobot where he played many roles, including Chief Customer Officer. Prior to that, he worked as a data scientist in the financial sector after earning a PhD in applied statistics from the University of Alabama. In his spare time, Greg manufactures a line of flavored breakfast cereal toppings called Cerup. 
He lives in Spring Creek, Nevada with his wife, four children, and two Clumber Spaniels.", "public_name": "Greg Michaelson", "guid": "33cd971f-1681-5cde-9541-d664481faba5", "url": "https://cfp.pydata.org/virginia2025/speaker/SNLNFQ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/GLBTZD/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/GLBTZD/", "attachments": []}, {"guid": "7821c06f-67ed-5379-99a1-bf7f8e47df5a", "code": "PG9CKX", "id": 77122, "logo": null, "date": "2025-04-18T11:30:00-04:00", "start": "11:30", "duration": "00:35", "room": "Auditorium 3", "slug": "virginia2025-77122-the-art-of-brain-data-in-asd-subjects-celebrating-neurodiversity-through-aesthetic-data-visualization", "url": "https://cfp.pydata.org/virginia2025/talk/PG9CKX/", "title": "The Art of Brain Data in ASD Subjects: Celebrating Neurodiversity Through Aesthetic Data Visualization", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In our project, we took MRI-derived brain data and reinterpreted it through an aesthetic lens. Using multidimensional scaling (MDS) to distill complex patterns in cortical anatomy, we transformed these insights into physical 3D-printed brain models. Each sculpture serves as a tangible narrative, celebrating both the subtle and striking differences between male and female brains, whether neurotypical or affected by ASD.", "description": "Historically, research has highlighted a notable disparity in ASD diagnoses\u2014with males being diagnosed significantly more frequently than females. However, beneath these statistics lies a rich tapestry of neuroanatomical diversity that often goes unnoticed. 
Our work reimagines this disparity as a piece of art, where data becomes a sculptural medium inviting viewers to engage with and reflect on the intricacies of brain structure.\r\n\r\nDrawing on over 300 3D brain surface models from the Autism Centers of Excellence (ACE) study, our approach blends advanced MRI neuroimaging, multivariate statistical analysis, and cutting-edge 3D printing technology. The result is an artful representation that not only quantifies but also visually and tangibly celebrates sex differences in brain morphology across both ASD and non-ASD populations.\r\n\r\nThis presentation will take you on a journey through our methodological and creative process\u2014from the acquisition and analysis of complex neuroimaging data to the transformation of these insights into physical art. We will discuss the technical details of MRI scanning, the challenges and innovations in our multivariate analyses, and the craftsmanship behind the 3D printing process.\r\n\r\nDesigned for an audience spanning both scientific and artistic disciplines, this presentation aims to inspire new ways of thinking about data visualization. By embracing \"data as art,\" we encourage a more holistic understanding of neurodiversity\u2014one that not only informs but also resonates on an emotional and aesthetic level. Join us for this presentation as we explore how the fusion of art and science can lead to innovative insights into the human brain, fostering a deeper appreciation for the nuanced interplay of sex differences in ASD and beyond.", "recording_license": "", "do_not_record": false, "persons": [{"code": "B78YRC", "name": "Siwen Liao", "avatar": "https://cfp.pydata.org/media/avatars/B78YRC_hXALUYG.webp", "biography": "Siwen Liao is a second-year undergraduate student at the University of Virginia studying statistics and physics. 
Her academic interests focus on applying data science and quantitative methods to medicine and healthcare.", "public_name": "Siwen Liao", "guid": "8537faf4-3cc1-579d-9122-cac817242b7d", "url": "https://cfp.pydata.org/virginia2025/speaker/B78YRC/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/PG9CKX/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/PG9CKX/", "attachments": []}, {"guid": "1ac5a959-e44d-5c6f-83e8-31e17ad280ba", "code": "CF3VVT", "id": 77199, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/CF3VVT/evictions_filed_0VbNf8J_A7PSGEl.png", "date": "2025-04-18T12:05:00-04:00", "start": "12:05", "duration": "00:30", "room": "Auditorium 3", "slug": "virginia2025-77199-exploring-eviction-trends-in-virginia", "url": "https://cfp.pydata.org/virginia2025/talk/CF3VVT/", "title": "Exploring Eviction Trends in Virginia", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Where do landlords engage in more eviction actions? What characteristics of renters or landlords increase the practice of serial filing? There is widespread interest in using administrative data -- information collected by government and agencies in the implementation of public programs -- to evaluate systems and promote more just outcomes. Working with the Civil Court Data Initiative of Legal Services Corporation, we use data collected from civil court records in Virginia to analyze the behavior of landlords. Expanding on our Virginia Evictors Catalog, we use data on court evictions to build additional data tools to support the work of legal and housing advocates and model key eviction outcomes to contribute to our understanding of landlord behavior.", "description": "Virginia is home to 5 of the top 10 cities with the highest rates of eviction nationwide. Housing instability threatens the security of entire communities and burdens already limited social safety nets. 
Yet research shows that housing instability is rooted not in individual or community failures, but in policies of exclusion, displacement, disinvestment, and discrimination.\r\n\r\nWhile collected to support programmatic goals, administrative data can also be used to shift the lens to those in power. In this work we first visualize eviction activity across the Commonwealth in an interactive Shiny app to address questions and needs of organizations providing legal, policy, and community advocacy. In addition we estimate landlord actions \u2013 eviction filings and serial filings \u2013 as a function of community and landlord characteristics. Using a series of mixed-effects models, with data aggregated to zipcodes nested in counties, we estimate the impact of community characteristics and landlord attributes on the likelihood of eviction filings and nuisance filings.  Both the app and analysis speak to the larger causes and consequences of housing instability.", "recording_license": "", "do_not_record": false, "persons": [{"code": "CMJNN9", "name": "Samantha Toet", "avatar": "https://cfp.pydata.org/media/avatars/CMJNN9_AWEzCDO.webp", "biography": "Samantha grew up in Charlottesville and earned her bachelor's degree in social psychology research from UVA. She previously worked as a Solutions Engineer and Partnership Manager at RStudio, with a focus on open-source technology advocacy. She believes that everyone should be able to make informed, data-driven decisions regardless of their means, and is passionate about enabling her community with tools to support more equitable and accessible analytics. She currently serves as a Data Scientist at the Virginia Equity Center.", "public_name": "Samantha Toet", "guid": "c92d1186-1822-516b-9d7d-6203f3339018", "url": "https://cfp.pydata.org/virginia2025/speaker/CMJNN9/"}, {"code": "MBXUDZ", "name": "Dr. 
Michele Claibourn", "avatar": null, "biography": "As the Director of Equitable Analysis, Michele Claibourn leads the UVA Equity Center\u2019s community-engaged data science work in support of a more equitable and just region. Michele works to connect the developing data expertise of UVA students to the community as well through her faculty appointment in the Batten School of Leadership and Public Policy, where she teaches courses on Imagining Equitable Policy and Public Interest Data: Ethics and Practice, and a courtesy appointment in the School of Data Science, where she helped launch a Community Data Fellows program.", "public_name": "Dr. Michele Claibourn", "guid": "eda5c56e-4202-55bc-84d8-e39b81822a20", "url": "https://cfp.pydata.org/virginia2025/speaker/MBXUDZ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/CF3VVT/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/CF3VVT/", "attachments": []}, {"guid": "661db1df-6324-552c-90c5-4b5ca9eb0473", "code": "NNXPCL", "id": 77521, "logo": null, "date": "2025-04-18T12:35:00-04:00", "start": "12:35", "duration": "01:00", "room": "Auditorium 3", "slug": "virginia2025-77521-author-chat-book-signing", "url": "https://cfp.pydata.org/virginia2025/talk/NNXPCL/", "title": "Author Chat & Book Signing", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Lunchtime chat with data science authors, with some offering book giveaways and signing books!", "description": "Come meet the authors of some of your favorite data science books, or learn more about a book you're interested in but haven't purchased, yet. \r\n\r\nThe authors listed below will be available during lunch for informal discussions, so drop in any time during the lunch break for a meet & greet. Some authors will be signing books, so bring your books written by these authors if you want your copy autographed! 
(And check this schedule again before Friday, as we may have authors joining this session up until the day before the event.) Some limited copies may be available as giveaways.\r\n\r\nWill Ayd: Pandas Cookbook, Third Edition (Packt)\r\n\r\nSuhas Pai: Designing Large Language Model Applications (O'Reilly)\r\n\r\nRenee M. P. Teate: SQL for Data Scientists (Wiley)\r\n\r\nMatt Topol: In-Memory Analytics with Apache Arrow (Packt)\r\n\r\n\r\nNote that author John Berryman will be presenting a tutorial on Saturday, and will be available during lunchtime on Saturday to chat about his book \"Prompt Engineering for LLMs: The Art and Science of Building Large Language Model-Based Applications\" (O'Reilly).", "recording_license": "", "do_not_record": false, "persons": [{"code": "YYSMTU", "name": "William Ayd", "avatar": "https://cfp.pydata.org/media/avatars/YYSMTU_ZDbJlqh.webp", "biography": "Will Ayd is the author of the Pandas Cookbook, Third Edition, and has served as a maintainer of the pandas project since 2018. Will is also a Committer to the Apache Arrow project, and has helped improve countless more open source data libraries.\r\n\r\nIn his day job, Will helps clients in the Retail and Apparel spaces optimize cloud data platforms in AWS and GCP, while also providing strategy and training around the use of open source technology in enterprise settings.", "public_name": "William Ayd", "guid": "5cb82441-867f-56be-9672-d716e1da7ce7", "url": "https://cfp.pydata.org/virginia2025/speaker/YYSMTU/"}, {"code": "ESKHPZ", "name": "Matthew Topol", "avatar": "https://cfp.pydata.org/media/avatars/ESKHPZ_7yQEC7q.webp", "biography": "Hailing from the faraway land of Brentwood, NY and currently residing in the rolling hills of Connecticut, Matt Topol has always been passionate about software. After graduating from Brooklyn Polytechnic (now NYU-Poly), he joined FactSet Research Systems, Inc. in 2009 developing financial software. 
In the time since, Matt has worked in infrastructure and application development, has led development teams, and architected large-scale distributed systems for processing analytics on financial data. Matt is a PMC member for the Apache Arrow project, frequently enhancing the Golang library among other enhancements and helping to grow the Arrow Community. Recently, Matt wrote the first and only book on Apache Arrow \"In-Memory Analytics with Apache Arrow\" and joined Voltron Data in order to work on the Apache Arrow libraries full time and grow the Arrow Golang community.\r\n\r\nIn his spare time, Matt likes to bash his head against a keyboard, develop/run delightfully demented games of fantasy for his victims--er--friends, and share his knowledge with anyone interested who'll listen to his rants.", "public_name": "Matthew Topol", "guid": "1b97e2ad-76ee-53ac-bf3b-f8e294851f66", "url": "https://cfp.pydata.org/virginia2025/speaker/ESKHPZ/"}, {"code": "B3XXAU", "name": "Renee Teate", "avatar": "https://cfp.pydata.org/media/avatars/B3XXAU_Rh19V2L.webp", "biography": "Renee Teate is the Senior Director of Data Science at higher ed tech company HelioCampus and author of SQL for Data Scientists (Wiley). Many people know her as the host of the Becoming a Data Scientist Podcast, or as \"Data Science Renee\" from BlueSky (previously becomingdatasci on Twitter).\r\n\r\nRenee lives in Harrisonburg, VA, and is a graduate of JMU and UVA. She has worked with data her entire career, as a database designer, data analyst, data scientist, and director. 
She enjoys chatting with people looking to \"break into\" data careers, or looking to build their data science network.", "public_name": "Renee Teate", "guid": "ed6a1eeb-d9d2-599c-be91-bb7af22f10ab", "url": "https://cfp.pydata.org/virginia2025/speaker/B3XXAU/"}, {"code": "EDBNWF", "name": "Suhas Pai", "avatar": "https://cfp.pydata.org/media/avatars/EDBNWF_m3PGbrN.webp", "biography": "Suhas Pai is an NLP researcher and co-founder/CTO at Hudson Labs, a Toronto-based, Y Combinator-backed startup. He is the author of the book 'Designing Large Language Model Applications', published by O'Reilly Media. He has contributed to the development of several open-source LLMs, including being the co-lead of the Privacy working group at BigScience, as part of the BLOOM LLM project. Suhas is active in the ML community, being Chair of the TMLS (Toronto Machine Learning Summit) conference since 2021. He is also a frequent speaker at AI conferences worldwide, and hosts regular seminars discussing the latest research in the field of NLP.", "public_name": "Suhas Pai", "guid": "3f65abdd-e84d-5b42-ae41-eaef6b04b8d8", "url": "https://cfp.pydata.org/virginia2025/speaker/EDBNWF/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/NNXPCL/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/NNXPCL/", "attachments": []}, {"guid": "ba6e91e6-f42c-5849-a477-6a6ea82d28d2", "code": "L3GESN", "id": 77498, "logo": null, "date": "2025-04-18T14:55:00-04:00", "start": "14:55", "duration": "00:35", "room": "Auditorium 3", "slug": "virginia2025-77498-using-changepoint-and-bayesian-analysis-to-drive-safety-improvements-in-mining", "url": "https://cfp.pydata.org/virginia2025/talk/L3GESN/", "title": "Using Changepoint and Bayesian Analysis to Drive Safety Improvements in Mining", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "In the mining industry's pursuit of zero harm, distinguishing real safety improvements from random variation is crucial 
yet challenging. This talk demonstrates how classical changepoint analysis and Bayesian methods provide safety teams at Asarco LLC with rigorous tools to objectively evaluate progress towards our zero-harm goal. Using near miss reporting and lost time metrics, we will show how these statistical approaches help identify meaningful trends while avoiding misleading conclusions from natural variation. While the focus is on mining, these methods are applicable to other safety-critical and data-limited scenarios. No prior experience with changepoint analysis is required.", "description": "The presentation will cover how changepoint analysis is implemented, how the insights generated are applied to improve the safety metrics, and the challenges we have faced in communicating the insights. It will be structured as follows:\r\n\u2022\tUnderstanding variability in the process (5 min): How random variation impacts safety metrics and challenges in measuring zero-harm.\r\n\u2022\tChangepoint analysis implementation (10 min): Introduction to changepoint analysis using the changepoint package from R and Bayesian changepoint using the RBeast package from Python.\r\n\u2022\tCommunicating the insights (10 min): Challenges in communicating the insights and presenting them in a way that is actionable for the safety team and executives.\r\n\u2022\tQ&A (5-10 min): Open discussion and audience questions.\r\nAttendees will learn:\r\n\u2022\tWhy comparing absolute numbers might be misleading.\r\n\u2022\tHow to implement changepoint analysis to detect significant changes in safety metrics.\r\n\u2022\tStrategies to communicate actionable findings in non-data science teams and executive level.\r\nThis session is ideal for data practitioners with a background in basic probability and statistics (e.g., understanding distributions and confidence intervals). 
No programming expertise is required, but references to Python libraries and code snippets will provide actionable insights for those looking to implement these techniques in their work.", "recording_license": "", "do_not_record": false, "persons": [{"code": "CUXPXZ", "name": "Mauricio Mathey", "avatar": "https://cfp.pydata.org/media/avatars/CUXPXZ_Kj0PZZN.webp", "biography": "Mauricio is the leader of the Advanced Analytics Group at Asarco LLC, a subsidiary of Grupo Mexico, where he leverages AI/ML to drive improvements in costs, productivity, and safety. With over 7 years of experience across Latin America and the US, Mauricio has a proven track record in consulting and applying advanced analytics to solve complex business challenges. Prior to joining Asarco, he led commercial strategy analytics projects at EY-Parthenon. Mauricio has a Ms. in Data Science and an MBA from the University of Virginia and holds a Bs. in Industrial Engineering from the University of Lima.", "public_name": "Mauricio Mathey", "guid": "10f630ac-8372-5c12-bd3e-ddaadcec0fa4", "url": "https://cfp.pydata.org/virginia2025/speaker/CUXPXZ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/L3GESN/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/L3GESN/", "attachments": []}, {"guid": "76ce2f38-66e0-5d21-8cb8-a1be04b02b65", "code": "WRJYDF", "id": 77461, "logo": null, "date": "2025-04-18T15:30:00-04:00", "start": "15:30", "duration": "00:35", "room": "Auditorium 3", "slug": "virginia2025-77461-the-secret-sauce-of-customer-satisfaction-turning-data-pipelines-into-data-products", "url": "https://cfp.pydata.org/virginia2025/talk/WRJYDF/", "title": "The Secret Sauce of Customer Satisfaction: Turning Data Pipelines into Data Products", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "What comes to mind when you think of an exceptional customer experience? 
Whether it was a \"peak experience\" or a \"dumpster fire\", it stuck with you! We recognize the importance of great customer experiences in industries like retail and hospitality\u2014but what about in data? Does long-term success depend on creating exceptional customer experiences, or are client expectations just challenges to manage?\r\n\r\nIn this session we will share insights from a data and analytics project Elder Research is implementing for a Quick-Service Restaurant corporation. By prioritizing the customer experience and embracing a \"Data as a Product\" mindset, data teams can drive greater business value and build stronger, more sustainable client relationships.", "description": "Since 2023, Elder Research has partnered with a major U.S.-based Quick Service Restaurant corporation to enhance the effectiveness of their enterprise data & analytics group. Our goal was to instill a \"Data as a Product\" mindset across six Data Portfolios, which support internal analytics teams by maintaining core data pipelines for critical business and customer-facing applications.\r\n\r\nIn this talk, we\u2019ll share key insights from the work by our technical business analysts and data engineers on this project, highlight the business value delivered to our client, and explore how \"Data as a Product\" principles can strengthen client relationships for all of us.", "recording_license": "", "do_not_record": false, "persons": [{"code": "N8ZND8", "name": "Josh Fairchild", "avatar": null, "biography": "Josh brings a background in non-profit organizational leadership to data and analytics consulting. He is passionate about helping teams thrive by implementing best practices when it comes to change management, data governance, and process development. He is an alumnus of the University of Virginia, with a B.S. 
in Computer Engineering.", "public_name": "Josh Fairchild", "guid": "0e2e6a80-7f93-549d-9b02-92e7e6909324", "url": "https://cfp.pydata.org/virginia2025/speaker/N8ZND8/"}, {"code": "MUPKGV", "name": "Liam Agnew", "avatar": null, "biography": "Liam brings a background in multidisciplinary product R&D and project management to approach data engineering challenges with creative, yet structured, solutions. He has experience with Python, MongoDB, C++, Java, JavaScript, and mobile app development. He is an alumnus of the University of Virginia, with a B.S. in Mechanical Engineering and minor in Materials Science and Engineering.", "public_name": "Liam Agnew", "guid": "b774337a-e520-5486-a086-22a50327a982", "url": "https://cfp.pydata.org/virginia2025/speaker/MUPKGV/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/WRJYDF/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/WRJYDF/", "attachments": []}, {"guid": "11132e35-4695-59cd-b850-ebbad8601334", "code": "9BTPLD", "id": 77128, "logo": null, "date": "2025-04-18T16:05:00-04:00", "start": "16:05", "duration": "00:35", "room": "Auditorium 3", "slug": "virginia2025-77128-machine-learning-pipelines-in-higher-education-lessons-learned-taking-models-from-training-to-production", "url": "https://cfp.pydata.org/virginia2025/talk/9BTPLD/", "title": "Machine Learning Pipelines in Higher Education: Lessons Learned Taking Models From Training to Production", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "Building machine learning models with live, human-centric data is often a messy endeavor. However, by thinking about the entire machine learning pipeline and the lifecycle of the population being modeled we can prevent the model (and data scientist) from overpromising and underdelivering. 
Come learn about potential pitfalls that occur when working with human-centric data and what you can do to prevent it from ruining your model performance.", "description": "In this talk, we will discuss some lessons learned working on human-centric data in higher education and the pitfalls you may encounter. The higher education student cycle begins with admissions, follows the student throughout the terms they attend, and ideally ends with graduation. Using this student lifecycle as a guide, we will dive into how the data available at each point of the student lifecycle and machine learning pipeline needs to be accounted for during training to prevent failures in production. We will also discuss how working with operational datasets provides unique limits to our models and what to watch out for.\r\n\r\nThis talk is geared towards a general audience, though familiarity with machine learning will be helpful. \r\n\r\nOutline:\r\n\r\nIntroduction to the student lifecycle (5 min)\r\n\r\nIntroduction to machine learning pipelines (5 min)\r\n\r\nWorking with data from across the student lifecycle (10 min)\r\n\r\nWorking with operational datasets for a machine learning model (5 min)\r\n\r\nConcluding thoughts and Q&A (5 min)", "recording_license": "", "do_not_record": false, "persons": [{"code": "8M7CWC", "name": "Brian Richards", "avatar": null, "biography": "Brian Richards is a Senior Data Scientist at HelioCampus and works with data across the higher education student lifecycle to help colleges and universities better understand their students and support them through graduation. 
Brian also has an interest in exploring model evaluation techniques and helping end users better understand how their models work.", "public_name": "Brian Richards", "guid": "d4a7876e-df06-53f0-8e6b-8af5bab45205", "url": "https://cfp.pydata.org/virginia2025/speaker/8M7CWC/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/9BTPLD/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/9BTPLD/", "attachments": []}, {"guid": "80d0599a-4e2f-5e9c-b1e6-649ab62c4c03", "code": "RHCHVC", "id": 77411, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/RHCHVC/Screenshot_from_2025-02_wrkv5gr.png", "date": "2025-04-18T16:40:00-04:00", "start": "16:40", "duration": "00:35", "room": "Auditorium 3", "slug": "virginia2025-77411-what-is-geometric-algebra-and-can-it-help-me", "url": "https://cfp.pydata.org/virginia2025/talk/RHCHVC/", "title": "What is Geometric Algebra and can it help me?", "subtitle": "", "track": null, "type": "Talk", "language": "en", "abstract": "An introduction to Geometric Algebra, with a focus on how it can (and can't) be used as a practical computational tool in Python. The discussion will present concrete examples which make use of the open source python library \u2018Kingdon\u2019. The audience should leave with a grasp of what GA is and what it isn't,  so that they can decide if it is a tool worthy of their cognitive investment.", "description": "Geometric Algebra (GA) is a mathematical language that has recently received significant attention from the computer graphics and engineering communities. Proponents of GA claim that it provides a geometrically intuitive interface, concise syntax, and the ability to unify several of the most important algebras. This talk will discuss the pros and cons of GA as a practical computational tool in Python data science. The first half of the talk will introduce the concepts of GA, and the second half will provide concrete demonstrations with the Kingdon library. 
\r\nWhile geared toward data scientists, this talk can be enjoyed by anyone interested in applied mathematics. A basic background in linear algebra will be helpful. Additionally,  those using vector algebra, complex numbers, quaternions, rotation matrices and the like will be especially interested. The audience should leave with a grasp of what GA is and what it isn't,  so that they can decide if it is a tool worthy of their cognitive investment.", "recording_license": "", "do_not_record": false, "persons": [{"code": "3X9A8J", "name": "Alex Arsenovic", "avatar": "https://cfp.pydata.org/media/avatars/3X9A8J_CVHpAqK.webp", "biography": "Alex has worked as a data scientist, library builder, and math enthusiast with over 12 years of experience. He holds a B.S. and Ph.D. in Electrical Engineering from the University of Virginia (2007, 2012), where he specialized in microwave systems and applied mathematics. He founded Eight Ten Labs (810lab.com) in 2016, and has developed and maintained two widely adopted open-source Python libraries, scikit-rf and clifford,  has authored over 25 scientific papers,  and holds a U.S. patent (No. 
10459018) in electronic measurement systems.", "public_name": "Alex Arsenovic", "guid": "92f39776-87b6-5590-86b1-a9aa5fa5d13a", "url": "https://cfp.pydata.org/virginia2025/speaker/3X9A8J/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/RHCHVC/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/RHCHVC/", "attachments": []}]}}, {"index": 2, "date": "2025-04-19", "day_start": "2025-04-19T04:00:00-04:00", "day_end": "2025-04-20T03:59:00-04:00", "rooms": {"Room 120": [{"guid": "c992a846-52e1-5621-98b9-079469851943", "code": "7EUB8R", "id": 77106, "logo": null, "date": "2025-04-19T09:00:00-04:00", "start": "09:00", "duration": "01:30", "room": "Room 120", "slug": "virginia2025-77106-mastering-llms-from-prompt-engineering-to-agentic-ai", "url": "https://cfp.pydata.org/virginia2025/talk/7EUB8R/", "title": "Mastering LLMs: From Prompt Engineering to Agentic AI", "subtitle": "", "track": null, "type": "Tutorial", "language": "en", "abstract": "This workshop will provide a comprehensive introduction to Large Language Models (LLMs), covering their capabilities, structure, and practical applications. Participants will learn prompt engineering techniques, retrieval-augmented generation (RAG), agentic AI design, fine-tuning strategies, and model evaluation methods. The session will conclude with a discussion on the future of AI-powered reasoning machines.", "description": "The rapid evolution of AI and Large Language Models (LLMs) has opened new possibilities for automation, content generation, and interactive agents. This hands-on workshop is designed for developers, researchers, and AI enthusiasts who want to deepen their understanding of LLMs and learn how to harness their full potential. 
Topics covered include:\r\n- How LLMs work and the role of reinforcement learning in training\r\n- The art and science of prompt engineering, including zero-shot and few-shot techniques\r\n- Retrieval-Augmented Generation (RAG) for integrating external knowledge\r\n- Agentic AI: Designing chatbots and workflow agents\r\n- Fine-tuning models using LoRA for custom behaviors\r\n- Evaluation methods for improving AI performance\r\n- Future trends, including multimodal models and new interaction paradigms\r\nAttendees will leave with practical skills, implementation strategies, and insights into the future of AI-powered applications.", "recording_license": "", "do_not_record": false, "persons": [{"code": "LMU9CL", "name": "John Berryman", "avatar": "https://cfp.pydata.org/media/avatars/LMU9CL_rMSsrnQ.webp", "biography": "John Berryman is the founder and principal consultant of Arcturus Labs, where he specializes in AI application development (Agency and RAG). As an early engineer on GitHub Copilot, John contributed to the development of its completions and chat functionalities, working at the forefront of AI-assisted coding tools. John is coauthor of \"Prompt Engineering for LLMs\" (O'Reilly).\r\n\r\nBefore his work on Copilot, John's focus was search technology. His diverse experience includes helping to develop a next-generation search system for the US Patent Office, building search and recommendations for Eventbrite, and contributing to GitHub's code search infrastructure. 
John is also coauthor of Relevant Search (Manning), a book that distills his expertise in the field.\r\n\r\nJohn's unique background, spanning both cutting-edge AI applications and foundational search technologies, positions him at the forefront of innovation in LLM applications and information retrieval.", "public_name": "John Berryman", "guid": "7eb5192f-e45b-5049-8d28-da3aa914021b", "url": "https://cfp.pydata.org/virginia2025/speaker/LMU9CL/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/7EUB8R/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/7EUB8R/", "attachments": []}, {"guid": "49fc7589-8ab0-538d-a75d-0d71572f5f39", "code": "XPFPFE", "id": 77470, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/XPFPFE/Talk-flow-Diagram-V4_23_3dAtuJe.png", "date": "2025-04-19T11:00:00-04:00", "start": "11:00", "duration": "01:30", "room": "Room 120", "slug": "virginia2025-77470-building-rich-rag-systems-with-docling-unlock-information-from-tables-images-and-complex-documents", "url": "https://cfp.pydata.org/virginia2025/talk/XPFPFE/", "title": "Building Rich RAG Systems with Docling: Unlock Information from Tables, Images, and Complex Documents", "subtitle": "", "track": null, "type": "Tutorial", "language": "en", "abstract": "Traditional PDF extraction tools often struggle with complex layouts, tables, and images. Docling (an open-source Python library developed at IBM) excels at extracting structured information from these elements, enabling the creation of richer, more accurate vector databases. This hands-on tutorial will guide participants through building a Retrieval Augmented Generation (RAG) system using Docling, an open-source document processing library.\r\n\r\n\r\nParticipants will learn how to harness Docling's advanced capabilities to build superior RAG systems that can understand and retrieve information from complex document elements that traditional tools might miss. 
Participants will learn how to handle complex documents, extract structured information, and create an efficient vector database for semantic search. The session will cover best practices for document parsing, chunking strategies, and integration with popular LLM frameworks.", "description": "### Overview and Objectives\r\nThis tutorial leverages Docling (https://ds4sd.github.io/docling/), a powerful open-source library designed for advanced document processing and AI integration. The session aims to equip data scientists and ML engineers with practical skills for building robust RAG systems by utilizing Docling's comprehensive feature set. We will work through scenarios with multi-page tables, research paper processing maintaining multi-column layouts and equations, or technical documentation management that understands code blocks and diagrams. Through these examples, you'll gain practical experience in building robust document processing pipelines that outperform traditional extraction tools.\r\n\r\nParticipants will learn how to:\r\n- Process and parse various document formats (PDF, DOCX, HTML) using Docling\r\n- Extract structured information including tables, formulas, and images\r\n- Implement effective text chunking strategies for optimal retrieval\r\n- Create vector databases for semantic search\r\n- Integrate the pipeline with LLM frameworks for end-to-end RAG solutions\r\n\r\n### Target Audience\r\nThis tutorial is designed for:\r\n- Data scientists and ML engineers working on document processing and LLM applications\r\n- Software developers implementing RAG systems\r\n- Anyone interested in building production-ready document processing pipelines\r\n\r\n**Experience Level:** Intermediate\r\n\r\n**Prerequisites:**\r\n- Basic Python programming knowledge\r\n- Familiarity with basic NLP concepts\r\n- Understanding of LLMs and vector databases (basic level)\r\n\r\n### Technical Requirements\r\nParticipants should have:\r\n- Python 3.10 or 3.11 installed\r\n- 
A code editor or IDE\r\n- Ability to install Python packages via pip\r\n- 4GB+ of free disk space for models and dependencies\r\n\r\n### Detailed Outline (90 minutes)\r\n\r\n1. Introduction and Setup (15 minutes)\r\n   - RAG system architecture overview\r\n   - Setting up the development environment\r\n   - Installing Docling and dependencies\r\n\r\n\r\n2. Document Processing with Docling (25 minutes)\r\n   - Understanding Docling's document processing capabilities\r\n   - Comparing traditional PDF extraction vs. Docling's advanced parsing\r\n   - Advanced extraction of tables, images, and complex layouts\r\n   - Hands-on exercise: Processing sample documents with rich content\r\n\r\n\r\n3. Building the RAG Pipeline (25 minutes)\r\n   - Creating rich vector embeddings that preserve document structure\r\n   - Integration with LLM frameworks\r\n   - Hands-on exercise: Building a complete RAG pipeline\r\n\r\n\r\n4. Best Practices and Production Considerations (15 minutes)\r\n   - Performance optimization techniques\r\n   - Using accelerators \r\n   - Docling-serve https://github.com/docling-project/docling-serve to deploy Docling as API service\r\n   - Creating effective evaluations\r\n\r\n\r\n\r\n5. Q&A and Interactive Problem Solving (10 minutes)\r\n   - Addressing participant questions\r\n   - Troubleshooting common issues\r\n   - Discussion of real-world applications\r\n\r\n\r\n### Materials\r\nhttps://github.com/KrishnaRekapalli/docling-rag-tutorial-pydata-2025\r\n\r\n### Pre-work\r\nMake sure that you have a Hugging Face access token / Replicate API key for LLM inference. You can get some free inference credit on both platforms without credit card. Other option is local ollama. 
For more details check https://github.com/KrishnaRekapalli/docling-rag-tutorial-pydata-2025\r\n\r\n### Key Takeaways\r\nParticipants will leave the tutorial with:\r\n- Practical experience in building RAG systems\r\n- Understanding of document processing best practices\r\n- Ability to extract and utilize information from complex document elements\r\n- Hands-on experience comparing traditional vs. advanced extraction methods\r\n- Knowledge of common pitfalls and how to avoid them\r\n- Strategies for handling tables and images in RAG systems", "recording_license": "", "do_not_record": false, "persons": [{"code": "AHXCFN", "name": "Krishna Rekapalli", "avatar": "https://cfp.pydata.org/media/avatars/AHXCFN_MF0uTub.webp", "biography": "Krishna is a Senior Data Scientist at IBM's Watsonx.ai Solution Architecture Center of Excellence, specializing in designing and implementing enterprise-scale LLM-powered AI solutions and agentic workflows. With over 7 years of experience building machine learning applications, they bring extensive expertise in hybrid cloud architectures, geospatial data analysis, and artificial intelligence. 
At IBM, they work directly with clients to architect and deploy production-ready AI solutions, focusing on practical implementation challenges and scalable architectures.", "public_name": "Krishna Rekapalli", "guid": "e094efbc-ccd4-5ca7-87c6-7545a2643d24", "url": "https://cfp.pydata.org/virginia2025/speaker/AHXCFN/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/XPFPFE/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/XPFPFE/", "attachments": []}, {"guid": "bf321c2c-fbdb-5eba-ab54-1a21925ed30a", "code": "3JXT7N", "id": 77472, "logo": null, "date": "2025-04-19T13:30:00-04:00", "start": "13:30", "duration": "01:30", "room": "Room 120", "slug": "virginia2025-77472-build-your-own-data-science-ai-agents", "url": "https://cfp.pydata.org/virginia2025/talk/3JXT7N/", "title": "Build Your Own Data Science AI Agents", "subtitle": "", "track": null, "type": "Tutorial", "language": "en", "abstract": "When \u201cAI Agent\u201d became the buzz word, have you ever wondered: what exactly is an AI agent? What is the multi-agent system? And how can you use the power of AI agents in your **day-to-day data science workflow**? In this hands-on tutorial, I will introduce AI agents and demonstrate how to design, build, and manage a multi-agent system for your data science workflows. Participants will learn how to break down complex tasks, assign AI agents to collaborate effectively, and ensure accuracy and reliability in their outputs. We will also discuss the trade-offs, limitations, and best practices for incorporating AI agents into data science projects.", "description": "**Prerequisite**: \r\n1. OpenAI developer API Key. If you do not have one, here is a video to create an account and create the OpenAI API Key. https://www.youtube.com/watch?v=JuAOOO18ycg\r\n2. 
LangSmith API: https://smith.langchain.com/\r\n\r\n\r\n**Tutorial Materials**: find this Google Drive link: https://drive.google.com/drive/folders/1keoQYO6iEm_b9olxxcWgOfmpipProaPJ?usp=drive_link\r\n\r\nThis hands-on tutorial will guide participants through designing, building, and deploying AI agents to streamline data science tasks.\r\n\r\n**What You\u2019ll Learn**\r\nThis tutorial will provide a deep dive into AI agents and multi-agent systems, covering:\r\n- The role of AI agents in automating data science tasks such as data preprocessing, feature engineering, model selection, and evaluation.\r\n- How to design a multi-agent system that efficiently distributes tasks while ensuring reliability and accuracy.\r\n- Strategies for incorporating AI agents into everyday workflows to save time and enhance productivity.\r\n- Common challenges, trade-offs, and best practices when using AI agents in data science.\r\n\r\n**Tutorial Structure**\r\n1. Introduction to AI Agents in Data Science (15 minutes)\r\n- What are AI agents, and how do they fit into data science workflows?\r\n- Examples of AI-driven automation in data science.\r\n- Overview of multi-agent collaboration for data-related tasks.\r\n2. Setting Up the Development Environment (10 minutes)\r\n- Tools and frameworks for building AI agents in data science.\r\n- Accessing tutorial materials (Google Drive).\r\n3. Building an AI-Driven Data Science Workflow (40 minutes)\r\n- Hands-on implementation: Automating exploratory data analysis (EDA), data preprocessing, model training, and evaluation with AI agents.\r\n- Orchestrating agent collaboration for complex workflows.\r\n- Ensuring accuracy, reliability, and interpretability in AI-assisted data tasks.\r\n4. Challenges, Trade-offs, and Best Practices (15 minutes)\r\n5. 
Q&A and Wrap-Up (10 minutes)\r\n- Discussion on real-world applications and industry adoption.\r\n- Key takeaways and next steps for implementing AI agents in data projects.\r\n\r\n**Who Should Attend?**\r\nThis tutorial is designed for data analysts, data scientists, machine learning practitioners, and AI engineers looking to integrate AI agents into their workflows. Attendees should have a basic understanding of Python and machine learning concepts. \r\n\r\n**Prerequisites & Materials**\r\n- Skill Level: Intermediate (basic Python and ML knowledge recommended).\r\n- Resources: A Google Colab environment for hands-on execution (no local installation required).\r\n\r\nBy the end of this tutorial, participants will have a practical framework for using AI agents to automate and optimize data science workflows, improving efficiency and scalability in their projects.", "recording_license": "", "do_not_record": false, "persons": [{"code": "73ZGDW", "name": "Niharika Krishnan", "avatar": "https://cfp.pydata.org/media/avatars/73ZGDW_V5wEPg3.webp", "biography": "Niharika is a Machine Learning Engineer in NYC, working at the intersection of Quant Finance and AI. As a PyLadies organizer and WiDS ambassador, she fosters a community of women in NYC to collaborate and grow in the field of AI.", "public_name": "Niharika Krishnan", "guid": "a4cec0f7-2929-56cc-a497-43c126e83b37", "url": "https://cfp.pydata.org/virginia2025/speaker/73ZGDW/"}, {"code": "WQBZEL", "name": "Chuxin Liu", "avatar": "https://cfp.pydata.org/media/avatars/WQBZEL_hoxIMwH.webp", "biography": "Chuxin Liu, PhD holds a PhD in economics and currently works as a senior quantitative associate at JPMorgan. Chuxin has worked with leading companies in the field and has been an active member in the data communities. She is the chapter lead for AICamp and an ambassador for Women in Data Science (WiDS). 
Chuxin's teaching experience spans from institutions like City University of New York to public conferences including Pydata.", "public_name": "Chuxin Liu", "guid": "34462cb4-3a4c-5c09-9424-8acaad844f22", "url": "https://cfp.pydata.org/virginia2025/speaker/WQBZEL/"}, {"code": "J9JHUQ", "name": "Astha Puri", "avatar": "https://cfp.pydata.org/media/avatars/J9JHUQ_DejmeJj.webp", "biography": "Astha is a Senior Data Scientist at CVS Health, where she leads the design of recommendation engines for digital platforms, helping customers discover the right products and enabling patients to access the appropriate health services and support. She specializes in home screen personalization, leveraging data-driven insights to enhance user experiences. With a strong background in the tech industry, she is now applying her expertise to transform and innovate within the healthcare sector.", "public_name": "Astha Puri", "guid": "3fb57d58-1488-5a66-8a22-cd0be7f7798f", "url": "https://cfp.pydata.org/virginia2025/speaker/J9JHUQ/"}, {"code": "YR8GXF", "name": "Michelle Rojas", "avatar": null, "biography": null, "public_name": "Michelle Rojas", "guid": "e76e50be-685a-574a-bcf3-f480d5ad7fc4", "url": "https://cfp.pydata.org/virginia2025/speaker/YR8GXF/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/3JXT7N/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/3JXT7N/", "attachments": []}, {"guid": "41d015f9-1411-5abd-a0b0-2a7e3002e004", "code": "LMBBBF", "id": 77491, "logo": null, "date": "2025-04-19T15:30:00-04:00", "start": "15:30", "duration": "01:30", "room": "Room 120", "slug": "virginia2025-77491-blazing-the-ai-trail-using-langgraph-to-conquer-the-oregon-trail", "url": "https://cfp.pydata.org/virginia2025/talk/LMBBBF/", "title": "Blazing the AI Trail: Using LangGraph to Conquer the Oregon Trail", "subtitle": "", "track": null, "type": "Tutorial", "language": "en", "abstract": "Agents have become one of the most talked-about topics in the AI 
community, but much of the discussion focuses on its potential impact rather than practical implementation. This hands-on workshop will guide data scientists and engineers through building a complete workflow using LangGraph, and will show how to define custom tools, implement vector retrieval, leverage semantic caching, incorporate allow/block list routing, and structure model output for downstream consumption. In order to participate, attendees will need to have python (>=3.11), docker, an OpenAI api key, and the starter code for the project cloned.\r\n\r\n**Starter code**: https://github.com/redis-developer/agents-redis-lang-graph-workshop\r\n\r\n**Note**: participants can test their environment setup ahead of time by following the Readme and running `python test_setup.py` before the workshop.", "description": "Despite the growing excitement around AI agents, many practitioners lack clear guidance on how to implement them effectively. This workshop aims to bridge that gap by providing a structured, hands-on approach to building AI agent workflows with LangGraph. Participants will create an agent capable of playing the Oregon Trail and making in-game decisions, illustrating in a fun way not only how to implement agents but also when, why, and for what sorts of problems. \r\n\r\nSession outline:\r\n1. **Understanding Agent Workflows (10 min)**\r\n    - Overview of agentic workflows and their importance\r\n    - When and why to build agent workflows\r\n2. **Building a Basic LangGraph Agent (20 min)**\r\n    - Setting up the LangGraph framework\r\n    - Defining discrete operations with custom tools\r\n3. **Enhancing Agent Capabilities (20 min)**\r\n    - Structuring output for API interactions\r\n    - Implementing vector retrieval for RAG to improve contextual responses\r\n4. 
**Optimizing for Performance and Control (25 min)**\r\n    - Creating a semantic cache to reduce LLM latency and cost\r\n    - Implementing allow/block list routing for controlled execution\r\n5. **Review and Discuss (15 min)**\r\n    - Review what was just accomplished and why\r\n    - Discuss any design challenges or open debugging questions\r\n    - Open Q&A for questions related to best practice\r\n\r\nThis workshop has been tested with participants at a variety of levels and typically takes ~60 minutes to complete if environment setup has been confirmed as noted above.", "recording_license": "", "do_not_record": false, "persons": [{"code": "3QSTMY", "name": "Robert Shelton", "avatar": "https://cfp.pydata.org/media/avatars/3QSTMY_LRXzbsf.webp", "biography": "Robert is an Applied AI Engineer at Redis, where he specializes in vector search and AI applications, supporting the development of the redisvl package and collaborating with a wide range of customers, from startups to enterprise organizations. His expertise spans diverse use cases, including financial chat applications, e-commerce recommendation systems, and more. 
Prior to Redis, Robert honed his skills as a Data Scientist and Full-Stack Engineer in the logistics industry, leading innovative projects that bridged software development and the complexities of physical goods movement.\r\n\r\nWhen he's not diving into AI and data challenges, you can find Robert enjoying the great outdoors\u2014most likely savoring some camp stove ramen along the Appalachian Trail in his native Virginia.", "public_name": "Robert Shelton", "guid": "ee180fa4-cfe0-560e-92f2-3de01f0c7518", "url": "https://cfp.pydata.org/virginia2025/speaker/3QSTMY/"}], "links": [{"title": "Starter code", "url": "https://github.com/redis-developer/agents-redis-lang-graph-workshop", "type": "related"}], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/LMBBBF/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/LMBBBF/", "attachments": []}], "Room 130": [{"guid": "3091d893-4c2b-5e86-920f-4066244f9840", "code": "HNWLPV", "id": 77069, "logo": null, "date": "2025-04-19T09:00:00-04:00", "start": "09:00", "duration": "01:30", "room": "Room 130", "slug": "virginia2025-77069-responsible-ai-with-scipy", "url": "https://cfp.pydata.org/virginia2025/talk/HNWLPV/", "title": "Responsible AI with SciPy", "subtitle": "", "track": null, "type": "Tutorial", "language": "en", "abstract": "SciPy is a powerful library for scientific and technical computing in Python. The primary objectives of this presentation are to explore the core concepts of Responsible AI and to demonstrate these concepts with SciPy.", "description": "The tutorial provides an introduction to Responsible AI using SciPy.\r\n\r\nThis presentation will begin with an overview of Responsible AI concepts and of SciPy's core features. Following this, there will be a tutorial on how to implement Responsible AI concepts in SciPy. \r\n\r\nThe following items will be covered during the tutorial. 
\r\n\r\n- Data Processing and Validation \r\n- Bias Detection and Mitigation \r\n- Sensitivity Analysis \r\n- Explainability and Transparency \r\n\r\nEach topic will be demonstrated with examples, including links to extended tutorials featuring real-world applications from the healthcare industry.\r\n\r\nBy the end of this session, attendees will have a solid understanding of how to use SciPy for Responsible AI Applications. Additionally, they will be able to apply these concepts to their own projects immediately.", "recording_license": "", "do_not_record": false, "persons": [{"code": "JQRLY7", "name": "Andrea Hobby", "avatar": null, "biography": null, "public_name": "Andrea Hobby", "guid": "21ea07aa-8d9b-50b8-8754-adcfeaee8829", "url": "https://cfp.pydata.org/virginia2025/speaker/JQRLY7/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/HNWLPV/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/HNWLPV/", "attachments": []}, {"guid": "73f7add2-3522-5c16-b8e6-b18d88d48dad", "code": "RQCCPA", "id": 77489, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/RQCCPA/Dr._Kimberly_Deas_PyCon_qp84OZx.png", "date": "2025-04-19T11:00:00-04:00", "start": "11:00", "duration": "01:30", "room": "Room 130", "slug": "virginia2025-77489-data-viz-in-python-as-a-tool-to-study-hiv-health-disparities", "url": "https://cfp.pydata.org/virginia2025/talk/RQCCPA/", "title": "Data Viz in Python as a Tool to Study HIV Health Disparities", "subtitle": "", "track": null, "type": "Tutorial", "language": "en", "abstract": "Health disparities remain a critical challenge in public health, demanding innovative approaches to uncover inequities and drive actionable change. This webinar will demonstrate how Python can serve as a powerful tool for creating data visualizations that illustrate the unequal burden of HIV across different populations. 
Participants will learn how Python\u2019s popular libraries, such as Matplotlib, Seaborn, and Plotly, can transform complex datasets into accessible, impactful visuals.\r\nUsing an HIV dataset containing demographic, geographic, and clinical variables, this session will guide attendees through a series of practical examples. From creating heatmaps and geospatial maps to analyzing temporal trends, the webinar emphasizes how to identify and communicate key social determinants related to race, gender, socioeconomic status, and access to care. Through hands-on demonstrations, attendees will see how Python\u2019s capabilities streamline data analysis and visualization workflows.\r\nKey takeaways from the session include identifying regions and communities in Texas disproportionately affected by HIV, uncovering intersectional factors influencing health outcomes, and leveraging visual tools to inform policy and resource allocation. Special attention will be given to designing visuals that resonate with non-technical audiences, ensuring findings are actionable for public health professionals and policymakers.", "description": "Description: Data Viz in Python as a Tool to Study Health Disparities\r\n\r\nTargeted to the intermediate Python user, this session will begin with a brief overview of the tools and libraries that will be used, such as Pandas, Matplotlib, Seaborn, Plotly, and GeoPandas. Participants will do hands-on coding, exploring how to transform secondary data into practical, professional visuals. 
Key coding topics include:\r\n1.\tData Preprocessing and Exploration:\r\no\tAdvanced techniques in Pandas for cleaning and reshaping datasets, including handling missing data and filtering key variables.\r\no\tConducting exploratory data analysis (EDA) to uncover trends and patterns related to HIV disparities.\r\n\r\n2.\tBuilding Complex Visualizations:\r\no\tHeatmaps with Seaborn to visualize correlations between demographic factors and health outcomes.\r\no\tGeospatial maps using GeoPandas and Plotly to pinpoint regions with high HIV prevalence and disparities in care access.\r\no\tBar plots, stacked charts, and histograms to analyze outcomes across intersectional demographics.\r\no\tTime series plots using Matplotlib and Seaborn to explore temporal changes in HIV rates and interventions.\r\n\r\n3.\tNext Steps:\r\no\tShare Findings with Stakeholders: Present the visualizations and key insights to relevant stakeholders, such as public health officials, policymakers, healthcare providers, and community organizations, using clear and actionable language.\r\no\tDevelop Targeted Interventions: Use the insights from the analysis to design and propose interventions aimed at addressing identified disparities, such as community outreach programs, resource allocation strategies, or policy changes.\r\no\tMonitor and Evaluate Impact: Implement a plan to track the effectiveness of interventions using measurable outcomes, such as reductions in infection rates or improvements in access to care, and iterate on strategies based on the results.\r\no\tBuild Collaborative Partnerships: Partner with community organizations, research institutions, and funding agencies to amplify efforts, secure resources, and ensure sustained action to address health disparities over time.\r\n\r\nThis session will emphasize practical, hands-on coding, and participants are encouraged to follow along to develop scripts they can apply to their own datasets. 
By the end of the webinar, attendees will have a deeper understanding of how to use Python for data visualization and actionable insights in public health.", "recording_license": "", "do_not_record": false, "persons": [{"code": "KMJMDU", "name": "Dr. Kimberly Deas", "avatar": "https://cfp.pydata.org/media/avatars/KMJMDU_1WltctQ.webp", "biography": "With over 10 years of real world data (RWD) experience in Informatics, Biostatistics, Data Science, and Epidemiology, and over 20 years as a Scientist, Dr. Kimberly Deas is currently a Principal Analytics Research Scientist Consultant. Her work experiences and specializations include healthcare informatics, health disparities, chemical and cancer informatics, and computational toxicology. Dr. Deas is a passionate Data educator, teaching data science, healthcare analytics, and data visualization at the collegiate level primarily through coding webinars. In her spare time, Dr. Deas enjoys golf, crocheting, walking, and reading for leisure.", "public_name": "Dr. 
Kimberly Deas", "guid": "3f827316-c333-5f44-9cb8-a32fe6538415", "url": "https://cfp.pydata.org/virginia2025/speaker/KMJMDU/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/RQCCPA/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/RQCCPA/", "attachments": []}, {"guid": "2497cab6-ed2e-508d-8e4f-b3009808eaf1", "code": "WAWAHD", "id": 77215, "logo": null, "date": "2025-04-19T13:30:00-04:00", "start": "13:30", "duration": "01:30", "room": "Room 130", "slug": "virginia2025-77215-getting-started-with-rapids-gpu-accelerated-data-science-for-pydata-users", "url": "https://cfp.pydata.org/virginia2025/talk/WAWAHD/", "title": "Getting Started with RAPIDS: GPU-Accelerated Data Science for PyData Users", "subtitle": "", "track": null, "type": "Tutorial", "language": "en", "abstract": "In this introductory hands-on tutorial, participants will learn how to accelerate their data workflows with [RAPIDS](https://rapids.ai/), an open-source suite of libraries designed to leverage the power of [NVIDIA](https://www.nvidia.com/) GPUs for end-to-end data pipelines. Using familiar PyData APIs like **cuDF** (GPU-accelerated pandas) and **cuML** (GPU-accelerated machine learning), attendees will explore how to seamlessly integrate these tools into their existing workflows with minimal code changes, achieving significant speedups in tasks such as data processing and model training.", "description": "[NVIDIA](https://www.nvidia.com/) GPUs offer unmatched speed and efficiency for data processing and model training, significantly reducing the time and cost associated with these tasks. The appeal of GPUs becomes even stronger with zero-code-change libraries and plugins, allowing you to take advantage of GPU acceleration without having to rewrite your existing code. 
With [RAPIDS](https://rapids.ai/), you can use popular PyData libraries like **pandas**, **polars**, and **networkx** while reaping the performance benefits of GPUs.\r\n\r\nThis tutorial provides an introduction to **RAPIDS**, an open-source suite of libraries that accelerates data science and machine learning workflows using GPU technology. Aimed at data scientists and machine learning practitioners of all experience levels, the session will focus on how RAPIDS can be seamlessly integrated into existing data pipelines to achieve substantial performance improvements with minimal code changes.\r\n\r\nThrough hands-on coding exercises, attendees will explore the RAPIDS ecosystem, including **cuDF** (GPU-accelerated pandas) and **cuML** (GPU-accelerated machine learning), and learn how to integrate these tools into their workflows to accelerate tasks like data processing and model training. By the end of this tutorial, they'll understand how RAPIDS integrates with the PyData ecosystem and significantly speeds up workflows.\r\n\r\nThe target audience for this tutorial is data scientists and machine learning practitioners. No prior GPU knowledge is required, but participants should have some experience with Python, pandas, and scikit-learn.", "recording_license": "", "do_not_record": false, "persons": [{"code": "3L9XAM", "name": "Naty Clementi", "avatar": "https://cfp.pydata.org/media/avatars/3L9XAM_UUErAc9.webp", "biography": "Naty is a Senior Software Engineer at [NVIDIA](https://www.nvidia.com/). She is a former academic with a Masters in Physics and PhD in Mechanical and Aerospace Engineering to her name. She is currently contributing to [RAPIDS](https://rapids.ai/), but in the past has also contributed to and maintained other open source projects such as Ibis and Dask. 
She is also an active member of PyLadies and an active volunteer and organizer of Women and Gender Expansive Coders DC meetups.", "public_name": "Naty Clementi", "guid": "c290bfa2-07ca-5992-9b3f-10de4294ca0b", "url": "https://cfp.pydata.org/virginia2025/speaker/3L9XAM/"}, {"code": "8JC3ZJ", "name": "Mike McCarty", "avatar": "https://cfp.pydata.org/media/avatars/8JC3ZJ_TJDbrNN.webp", "biography": "Mike is a Senior Software Engineering Manager at NVIDIA working on RAPIDS where he manages teams working on RAPIDS Cloud and HPC deployments, build infrastructure and packaging, and PyData projects. He has also contributed to open source software projects in the PyData ecosystem such as Dask and Intake. He holds two bachelor\u2019s degrees in computer science and physics, and has over 20 years of experience in software engineering and scientific computing in astronomy, computational sciences, data science, machine learning, and enterprise products.", "public_name": "Mike McCarty", "guid": "d63af1fa-f63a-5e1c-aebb-76352195f153", "url": "https://cfp.pydata.org/virginia2025/speaker/8JC3ZJ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/WAWAHD/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/WAWAHD/", "attachments": []}, {"guid": "8bb77f1d-2e5f-5c86-be49-f12b558378aa", "code": "B9RT3L", "id": 77487, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/B9RT3L/chris-curry-GYpsSWHslHA_gF0xXH5.jpg", "date": "2025-04-19T15:30:00-04:00", "start": "15:30", "duration": "01:30", "room": "Room 130", "slug": "virginia2025-77487-from-pandas-to-pyspark", "url": "https://cfp.pydata.org/virginia2025/talk/B9RT3L/", "title": "From Pandas to PySpark", "subtitle": "", "track": null, "type": "Tutorial", "language": "en", "abstract": "Tired of waiting for massive datasets to load on your local machine? 
In this beginner-friendly tutorial, we\u2019ll explore how to scale your data analysis skills from pandas to PySpark using a real-world anime dataset. We\u2019ll walk through the basics of distributed computing, discuss why Spark was created, and demonstrate the benefits of working with PySpark for big data tasks\u2014including reading, cleaning, and transforming millions of records with ease. By the end of this workshop, you\u2019ll understand how PySpark harnesses cluster computing to handle large-scale data and you\u2019ll be comfortable applying these techniques to your own projects.\r\n\r\nParticipant Requirements:\r\n- A laptop (any OS) with an internet connection\r\n- A Google account (to access Colab notebooks and slides)\r\n- Familiarity with Python and pandas\r\n\r\nHere's the link to the Google Colab to follow along \ud83d\udc47\ud83c\udffe\r\nhttps://colab.research.google.com/drive/1fi0cTQ1NIE5kDEH0ynp2sqDuVeiBJJWU?usp=sharing\r\n\r\nHere are the slides \ud83d\udc47\ud83c\udffe\r\nhttps://drive.google.com/file/d/11JIih1VzLxTJ9O6PeGzqD_e8vumTZQmw/view?usp=sharing", "description": "This tutorial aims to close the gap between small-scale data analysis and big data processing. If you\u2019ve ever tried to load a multi-gigabyte CSV into pandas or Excel, you know the frustration of crashing programs and endless waits. This tutorial shows how to level up your data skills using PySpark\u2019s distributed DataFrame API.\r\n\r\nWe\u2019ll do more than just introduce Spark concepts\u2014we\u2019ll work through a lively anime dataset full of ratings, genres, and user insights, so you can see how PySpark handles real-world tasks (like filtering, grouping, and joining) at scale. You\u2019ll get comfortable with Spark\u2019s architecture and learn how it uses lazy evaluations, cluster computing, and in-memory operations to achieve speedups. One highlight of the workshop is its hands-on approach: all exercises will be run in Google Colab. 
That means zero friction in setup\u2014no cluster installation or environment wrangling. We\u2019ll walk through the entire pipeline: loading massive CSV files, performing transformations that mirror pandas operations, and drawing insights through SQL-like queries.\r\n\r\nExpect a fast-paced but accessible look at Spark\u2019s key features, practical code examples, and best practices to keep your big data workflows efficient and transparent.\r\n\r\nTutorial Outline\r\n- Why Spark?: A short overview of Hadoop MapReduce and how Spark rose to address its shortcomings.\r\n- Distributed Data 101: Breaking down Spark\u2019s architecture, executors, and lazy evaluation.\r\n- Hands-On Setup: Launching PySpark in Google Colab so everyone can follow along in real time.\r\n- Exploring the Anime Dataset: Reading data from CSV, structuring DataFrames, and performing data cleaning.\r\n- Common Operations at Scale: Filtering, grouping, and aggregating millions of rows with PySpark.\r\n- Comparisons to Pandas: Mapping familiar DataFrame operations to their Spark counterparts.\r\n- Final Thoughts: Discussion of where Spark fits into modern data stacks, plus pointers for advanced usage (MLlib, streaming, cluster optimization).", "recording_license": "", "do_not_record": false, "persons": [{"code": "ZLVRZQ", "name": "Cynthia Ukawu", "avatar": "https://cfp.pydata.org/media/avatars/ZLVRZQ_Mqo0AAp.webp", "biography": "Cynthia is a geospatial software engineer with a passion for teaching and making technical concepts approachable. Currently working as a backend software engineer, she develops innovative geospatial solutions that solve real-world problems. Cynthia has a strong background in Python and data science, with experience mentoring students in data analytics at Springboard and teaching Python to beginners at Masterschool.\r\n\r\nIn addition to her professional work, Cynthia is an experienced public speaker. 
She\u2019s presented at PyTexas and at Arlington Code-The-Curb on her \u201cPark and Stride\u201d project\u2014a web app that helps commuters integrate walking into their daily routines. Her approachable teaching style combines hands-on learning with practical insights.\r\n\r\nOutside of work, Cynthia is passionate about graph theory, computer vision, and geospatial data. She\u2019s currently exploring the intersection of LiDAR technology and urban mobility. When she\u2019s not coding or mentoring, Cynthia enjoys dancing Samba and blogging about ways beginners can break into tech on her website, cynscode.com.", "public_name": "Cynthia Ukawu", "guid": "500a0274-5f0a-5264-ba7a-def9dc8faf41", "url": "https://cfp.pydata.org/virginia2025/speaker/ZLVRZQ/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/B9RT3L/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/B9RT3L/", "attachments": []}], "Room 140": [{"guid": "b9bc684d-259e-5757-9a84-5934277b839c", "code": "SHTFQY", "id": 77502, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/SHTFQY/image_class_tutorial_LY_2jsFsVZ.png", "date": "2025-04-19T09:00:00-04:00", "start": "09:00", "duration": "01:30", "room": "Room 140", "slug": "virginia2025-77502-tutorial-on-image-classification-using-scikit-image-scikit-learn-and-pytorch", "url": "https://cfp.pydata.org/virginia2025/talk/SHTFQY/", "title": "Tutorial on Image Classification using Scikit-Image, Scikit-learn, and PyTorch", "subtitle": "", "track": null, "type": "Tutorial", "language": "en", "abstract": "Tutorial on building an image segmentation and classification pipeline for binary or multiclass classification using the popular packages scikit-learn, scikit-image and PyTorch.", "description": "Welcome to the exciting world of computer vision and machine learning!  This tutorial presents foundational computer vision operations to prepare you to build your first successful classification pipeline.  
My goal is to help guide you past potential pitfalls and present topics for consideration as you embark on your machine learning journey.\r\n\r\n1. Computer Vision Basics\r\n   * The Basics\r\n   * Software and Packages\r\n2. Image Segmentation\r\n   * Preprocessing (histograms, filters)\r\n   * Thresholding\r\n   * Morphological Operators\r\n   * Advanced Segmentation\r\n3. Feature Extraction\r\n   * Textures\r\n      * GLCM\r\n      * LBP\r\n4. Model Development - scikit-learn\r\n   * scikit-learn\r\n       * Gaussian Process\r\n5. Feature Importance\r\n   * Shapley\r\n6. Neural Networks - PyTorch\r\n7.  Model Development\r\n    * CNN\r\n    * Transfer Learning\r\n8. Model Performance\r\n   * Tensorboard\r\n   * Saliency map\r\n\r\nNotebooks will be available prior to the start of the tutorial.  Please come prepared with the following python packages installed:\r\n* numpy\r\n* pandas\r\n* scikit-learn\r\n* scikit-image \r\n* torch\r\n* torchvision\r\n* pytensorboard", "recording_license": "", "do_not_record": false, "persons": [{"code": "LWQGJY", "name": "Matt Litz", "avatar": "https://cfp.pydata.org/media/avatars/LWQGJY_FfFghq8.webp", "biography": "Matt Litz is a Data Science Engineer at BWX Technologies in Lynchburg, VA. He earned his Master's in Data Science from the University of Virginia in 2023.  
Primary research interests include computer vision and innovative approaches to implementing Large Language Models.", "public_name": "Matt Litz", "guid": "6151a125-8a15-598f-acc4-ceef998c2314", "url": "https://cfp.pydata.org/virginia2025/speaker/LWQGJY/"}], "links": [{"title": "Github repo for tutorial files", "url": "https://github.com/mattlitz/pydata-virginia2025-image-clf.git", "type": "related"}], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/SHTFQY/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/SHTFQY/", "attachments": [{"title": "Presentation", "url": "/media/virginia2025/submissions/SHTFQY/resources/pydata-virgi_iF5fcnB.pptx", "type": "related"}]}, {"guid": "e7eb0b37-2575-50df-850d-45501f6056c9", "code": "GYFR7G", "id": 77114, "logo": null, "date": "2025-04-19T11:00:00-04:00", "start": "11:00", "duration": "01:30", "room": "Room 140", "slug": "virginia2025-77114-a-beginner-s-guide-to-variational-inference", "url": "https://cfp.pydata.org/virginia2025/talk/GYFR7G/", "title": "A Beginner's Guide to Variational Inference", "subtitle": "", "track": null, "type": "Tutorial", "language": "en", "abstract": "When Bayesian modeling scales up to large datasets, traditional MCMC methods can become impractical due to their computational demands. **Variational Inference (VI) offers a scalable alternative**, trading exactness for speed while retaining the essence of Bayesian inference. \r\n\r\nIn this tutorial, we\u2019ll **explore how to implement and compare VI techniques in PyMC**, including the Automatic Differentiation Variational Inference (ADVI) and the cutting-edge Pathfinder algorithm.\r\n\r\nStarting with simple models like linear regression, we\u2019ll gradually introduce more **complex, real-world applications**, comparing the performance of VI against Markov Chain Monte Carlo (MCMC) to understand the trade-offs in speed and accuracy. 
\r\n\r\n**This tutorial will arm participants with practical tools to deploy VI in their workflows** and help answer pressing questions, like \"*What do I do when MCMC is too slow?*\", or \"*How does VI compare to MCMC in terms of approximation quality?*\".", "description": "## Description\r\n\r\nThis tutorial is **for data scientists, statisticians, and machine learning practitioners who are comfortable with Python and basics of probability**.\r\n\r\nWe\u2019ll break down the mechanics of VI and its application in PyMC in an approachable way, starting with intuitive explanations and building up to practical examples.\r\n\r\nParticipants will learn how to apply ADVI and Pathfinder in PyMC and evaluate their results against MCMC, gaining insights into when and why to choose VI.\r\n\r\n### Takeaways\r\n\r\nParticipants will leave understanding:\r\n\r\n- The fundamentals of VI and how it differs from MCMC.\r\n- How to implement ADVI and Pathfinder in PyMC.\r\n- Practical considerations when selecting and evaluating inference methods.\r\n\r\n### Background Knowledge Required\r\n\r\n- Basic understanding of probability and Bayesian inference.\r\n- Familiarity with Python. Prior PyMC experience is helpful but not required.\r\n\r\n### Materials Distribution\r\n\r\nAll materials, including notebooks and datasets, will be available on GitHub.\r\n\r\n## Outline\r\n\r\n1. **Introduction: Why Variational Inference?** (10 min)\r\n- The limitations of MCMC for large datasets.\r\n- Overview of VI: How it works and why it\u2019s faster.\r\n\r\n2. **Variational Inference Basics** (20 min)\r\n- Key concepts: Evidence Lower Bound (ELBO), optimization, and approximation families.\r\n- Intuitive explanation of ADVI and Pathfinder.\r\n\r\n3. **Implementing VI with PyMC** (15 min)\r\n- Step-by-step walkthrough of VI with a linear model.\r\n- Comparing ADVI, Pathfinder, and MCMC.\r\n\r\n4. 
**Evaluating VI Approximations** (10 min)\r\n- How to measure the quality of VI approximations (ELBO, simulation-based calibration, etc.).\r\n- Practical trade-offs between speed and accuracy.\r\n\r\n5. **Scaling Up: Complex Models and Real-World Applications** (25 min)\r\n- Applying VI to hierarchical and large-scale models.\r\n- Tips for debugging and optimizing VI workflows.\r\n\r\n6. **Open Discussion and Q&A** (10 min)\r\n- Address audience-specific use cases and questions.", "recording_license": "", "do_not_record": false, "persons": [{"code": "P9T9SB", "name": "Chris Fonnesbeck", "avatar": "https://cfp.pydata.org/media/avatars/P9T9SB_pIhAeoa.webp", "biography": "Chris is a Principal Quantitative Analyst at PyMC Labs and an Adjunct Associate Professor at the Vanderbilt University Medical Center, with 20 years of experience as a data scientist in academia, industry, and government. He is interested in computational statistics, machine learning, Bayesian methods, and applied decision analysis. He hails from Vancouver, Canada and received his Ph.D. 
from the University of Georgia.", "public_name": "Chris Fonnesbeck", "guid": "68d4676d-1944-5971-81c6-3e46df57723a", "url": "https://cfp.pydata.org/virginia2025/speaker/P9T9SB/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/GYFR7G/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/GYFR7G/", "attachments": []}, {"guid": "45db58a9-e251-5eb3-bc7a-73d0cc823807", "code": "WZKH8G", "id": 77476, "logo": "https://cfp.pydata.org/media/virginia2025/submissions/WZKH8G/Wikipedia-logo-v2-wordm_aK7DOJf.png", "date": "2025-04-19T13:30:00-04:00", "start": "13:30", "duration": "01:30", "room": "Room 140", "slug": "virginia2025-77476-introduction-to-wikidata", "url": "https://cfp.pydata.org/virginia2025/talk/WZKH8G/", "title": "Introduction to Wikidata", "subtitle": "", "track": null, "type": "Tutorial", "language": "en", "abstract": "We will review Wikipedia, introduce Wikidata, then demonstrate queries to access wiki content", "description": "Wikipedia is the general reference source for humans to read. Wikidata is its interconnected, structured data complement, and accessible through queries. We will consider Wikidata's purpose, scope, and editorial community, then query for interesting results in pop culture, science, civics, and more. Attendees will learn how to access sample queries including through Jupyter Notebooks.", "recording_license": "", "do_not_record": false, "persons": [{"code": "93BW8B", "name": "Lane Rasberry", "avatar": "https://cfp.pydata.org/media/avatars/93BW8B_ybbrcAZ.webp", "biography": "Lane Rasberry is Wikimedian-in-residence at the School of Data Science at the University of Virginia. 
His interests include popular science, consumer protection, civic engagement, access to health information, clinical research, the Open Movement, data science, LGBT history, and Wikimedia projects.", "public_name": "Lane Rasberry", "guid": "5152b0d3-27f9-5297-b1e3-47cf87075f97", "url": "https://cfp.pydata.org/virginia2025/speaker/93BW8B/"}, {"code": "GDBPAM", "name": "Robin Isadora Brown", "avatar": "https://cfp.pydata.org/media/avatars/GDBPAM_gYMqyhW.webp", "biography": "Researcher", "public_name": "Robin Isadora Brown", "guid": "b252c9fb-6a5d-51d5-984b-01b3c6804daf", "url": "https://cfp.pydata.org/virginia2025/speaker/GDBPAM/"}], "links": [], "feedback_url": "https://cfp.pydata.org/virginia2025/talk/WZKH8G/feedback/", "origin_url": "https://cfp.pydata.org/virginia2025/talk/WZKH8G/", "attachments": []}]}}]}}}