2025 |
Soumyadip Ghosh, Peter J Haas, L Jeff Hong, Jonathan Ozik, Benjamin Thengvall Simulation Optimization 2050 and Beyond Journal Article 2025 Winter Simulation Conference (WSC), 2025. @article{Ghosh2025, title = {Simulation Optimization 2050 and Beyond}, author = {Soumyadip Ghosh and Peter J Haas and L Jeff Hong and Jonathan Ozik and Benjamin Thengvall}, year = {2025}, date = {2025-12-07}, journal = {2025 Winter Simulation Conference (WSC)}, abstract = {The goal of this panel was to envision the future of simulation optimization research and practice over the next 25 years. The panel was composed of five simulation researchers from academia, industry, and research laboratories who shared their perspectives on the challenges and opportunities facing the field in light of contemporary advances in artificial intelligence, machine learning and computing hardware. The panelists also discussed the role simulation optimization can, should, and will play in supporting future decision-making under uncertainty. This paper serves as a collection of the panelists' prepared statements.}, keywords = {}, pubstate = {published}, tppubtype = {article} } The goal of this panel was to envision the future of simulation optimization research and practice over the next 25 years. The panel was composed of five simulation researchers from academia, industry, and research laboratories who shared their perspectives on the challenges and opportunities facing the field in light of contemporary advances in artificial intelligence, machine learning and computing hardware. The panelists also discussed the role simulation optimization can, should, and will play in supporting future decision-making under uncertainty. This paper serves as a collection of the panelists' prepared statements. |
Sneha Gathani, Kevin Li, Raghav Thind, Sirui Zeng, Matthew Xu, Peter J Haas, Cagatay Demiralp, Zhicheng Liu PRAXA: A Framework for What-If Analysis Journal Article arXiv preprint arXiv:2510.09791, 2025. @article{Gathani2025b, title = {PRAXA: A Framework for What-If Analysis}, author = {Sneha Gathani and Kevin Li and Raghav Thind and Sirui Zeng and Matthew Xu and Peter J Haas and Cagatay Demiralp and Zhicheng Liu}, url = {https://arxiv.org/abs/2510.09791}, year = {2025}, date = {2025-10-10}, journal = {arXiv preprint arXiv:2510.09791}, abstract = {Various analytical techniques-such as scenario modeling, sensitivity analysis, perturbation-based analysis, counterfactual analysis, and parameter space analysis-are used across domains to explore hypothetical scenarios, examine input-output relationships, and identify pathways to desired results. Although termed differently, these methods share common concepts and methods, suggesting unification under what-if analysis. Yet a unified framework to define motivations, core components, and its distinct types is lacking. To address this gap, we reviewed 141 publications from leading visual analytics and HCI venues (2014-2024). Our analysis (1) outlines the motivations for what-if analysis, (2) introduces Praxa, a structured framework that identifies its fundamental components and characterizes its distinct types, and (3) highlights challenges associated with its application and implementation. Together, our findings establish a standardized vocabulary and structural understanding, enabling more consistent use across domains and communication with greater conceptual clarity. Finally, we identify open research problems and future directions to advance what-if analysis.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Various analytical techniques-such as scenario modeling, sensitivity analysis, perturbation-based analysis, counterfactual analysis, and parameter space analysis-are used across domains to explore hypothetical scenarios, examine input-output relationships, and identify pathways to desired results. Although termed differently, these methods share common concepts and methods, suggesting unification under what-if analysis. Yet a unified framework to define motivations, core components, and its distinct types is lacking. To address this gap, we reviewed 141 publications from leading visual analytics and HCI venues (2014-2024). Our analysis (1) outlines the motivations for what-if analysis, (2) introduces Praxa, a structured framework that identifies its fundamental components and characterizes its distinct types, and (3) highlights challenges associated with its application and implementation. Together, our findings establish a standardized vocabulary and structural understanding, enabling more consistent use across domains and communication with greater conceptual clarity. Finally, we identify open research problems and future directions to advance what-if analysis. |
Mohammad Hadi Nezhad, Francisco Enrique Vicente Castro, Eugene Mak, Peter J Haas, Danielle Allessio, Leon Osterweil, Injila Rasul, Heather Conboy, Ivon Arroyo Embedding Ethical Awareness in Computer Science and AI Education: The PEaRCE Approach to Responsible Computing Book Chapter International Conference on Artificial Intelligence in Education, pp. 135-149, Springer Nature Switzerland, 2025. @book{Nezhad2025, title = {Embedding Ethical Awareness in Computer Science and AI Education: The PEaRCE Approach to Responsible Computing}, author = {Mohammad Hadi Nezhad and Francisco Enrique Vicente Castro and Eugene Mak and Peter J Haas and Danielle Allessio and Leon Osterweil and Injila Rasul and Heather Conboy and Ivon Arroyo}, url = {https://link.springer.com/chapter/10.1007/978-3-031-98414-3_10}, year = {2025}, date = {2025-07-15}, journal = {International Conference on Artificial Intelligence in Education}, pages = {135-149}, publisher = {Springer Nature Switzerland}, abstract = {Despite the widespread use of computer systems, their societal impacts are often poorly understood, highlighting the need for the AIED community to acknowledge and contribute to incorporating Ethical and Responsible Computing (ERC) into Computer Science (CS) education. We introduce the Platform for Ethical and Responsible Computing Education (PEaRCE), an interactive tool created to integrate ERC into post-secondary education through realistic workplace simulations. PEaRCE scenarios guide students through ERC quandaries—awareness, decision-making, feedback—during the processes of developing advanced AIED and other technologies that may have beneficial and harmful impacts. Moreover, we integrate PEaRCE into CS courses via a sequence of structured learning modules and trained “ERC Teaching Assistants (TAs)” to support the integration process. We present insights from our deployment experiences, show PEaRCE’s potential to enhance ERC awareness and reasoning, and discuss the possibility of embedding PEaRCE into AI in Education courses.}, keywords = {}, pubstate = {published}, tppubtype = {book} } Despite the widespread use of computer systems, their societal impacts are often poorly understood, highlighting the need for the AIED community to acknowledge and contribute to incorporating Ethical and Responsible Computing (ERC) into Computer Science (CS) education. We introduce the Platform for Ethical and Responsible Computing Education (PEaRCE), an interactive tool created to integrate ERC into post-secondary education through realistic workplace simulations. PEaRCE scenarios guide students through ERC quandaries—awareness, decision-making, feedback—during the processes of developing advanced AIED and other technologies that may have beneficial and harmful impacts. Moreover, we integrate PEaRCE into CS courses via a sequence of structured learning modules and trained “ERC Teaching Assistants (TAs)” to support the integration process. We present insights from our deployment experiences, show PEaRCE’s potential to enhance ERC awareness and reasoning, and discuss the possibility of embedding PEaRCE into AI in Education courses. |
Sneha Gathani, Zhicheng Liu, Peter J Haas, Çağatay Demiralp What-if analysis for business professionals: Current practices and future opportunities Journal Article Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems, pp. 1-17, 2025. @article{Gathani2025, title = {What-if analysis for business professionals: Current practices and future opportunities}, author = {Sneha Gathani and Zhicheng Liu and Peter J Haas and Çağatay Demiralp}, url = {https://dl.acm.org/doi/full/10.1145/3706598.3713672}, year = {2025}, date = {2025-04-26}, journal = {Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems}, pages = {1-17}, abstract = {What-if analysis (WIA) is essential for data-driven decision-making, allowing users to assess how changes in variables impact outcomes and explore alternative scenarios. Existing WIA research primarily supports the workflows of data scientists and analysts, and largely overlooks business professionals who engage in WIA through non-technical means. To bridge this gap, we conduct a two-part user study with 22 business professionals across marketing, sales, product, and operations roles. The first study examines their existing WIA practices, tools, and challenges. Findings reveal that business professionals perform many WIA techniques independently using rudimentary tools due to various constraints. We then implement representative WIA techniques in a visual analytics prototype and use it as a probe to conduct a follow-up study evaluating business professionals’ practical use of the techniques. Results show that these techniques improve decision-making efficiency and confidence while underscoring the need for better support in data preparation, risk assessment, and domain knowledge integration. Finally, we offer design recommendations to enhance future business analytics systems.}, keywords = {}, pubstate = {published}, tppubtype = {article} } What-if analysis (WIA) is essential for data-driven decision-making, allowing users to assess how changes in variables impact outcomes and explore alternative scenarios. Existing WIA research primarily supports the workflows of data scientists and analysts, and largely overlooks business professionals who engage in WIA through non-technical means. To bridge this gap, we conduct a two-part user study with 22 business professionals across marketing, sales, product, and operations roles. The first study examines their existing WIA practices, tools, and challenges. Findings reveal that business professionals perform many WIA techniques independently using rudimentary tools due to various constraints. We then implement representative WIA techniques in a visual analytics prototype and use it as a probe to conduct a follow-up study evaluating business professionals’ practical use of the techniques. Results show that these techniques improve decision-making efficiency and confidence while underscoring the need for better support in data preparation, risk assessment, and domain knowledge integration. Finally, we offer design recommendations to enhance future business analytics systems. |
Amir Khosheghbal, Peter J Haas, Chaitra Gopalappa Mechanistic modeling of social conditions in disease-prediction simulations via copulas and probabilistic graphical models: HIV case study Journal Article Health Care Management Science, 28 , pp. 28-49, 2025. @article{Khosheghbal2025, title = {Mechanistic modeling of social conditions in disease-prediction simulations via copulas and probabilistic graphical models: HIV case study}, author = {Amir Khosheghbal and Peter J Haas and Chaitra Gopalappa}, url = {https://link.springer.com/article/10.1007/s10729-024-09694-3}, year = {2025}, date = {2025-03-05}, journal = {Health Care Management Science}, volume = {28 }, pages = {28-49}, abstract = {As social and economic conditions are key determinants of HIV, the United States ‘National HIV/AIDS Strategy (NHAS)’, in addition to care and treatment, aims to address mental health, unemployment, food insecurity, and housing instability, as part of its strategic plan for the ‘Ending the HIV Epidemic’ initiative. Although mechanistic models of HIV play a key role in evaluating intervention strategies, social conditions are typically not part of the modeling framework. Challenges include the unavailability of coherent statistical data for social conditions and behaviors. We developed a method, combining undirected graphical modeling with copula methods, to integrate disparate data sources, to estimate joint probability distributions for social conditions and behaviors. We incorporated these in a national-level network model, Progression and Transmission of HIV (PATH 4.0), to simulate behaviors as functions of social conditions and HIV transmissions as a function of behaviors. As a demonstration for the potential applications of such a model, we conducted two hypothetical what-if intervention analyses to estimate the impact of an ideal 100% efficacious intervention strategy. The first analysis modeled care behavior (using viral suppression as proxy) as a function of depression, neighborhood, housing, poverty, education, insurance, and employment status. The second modeled sexual behaviors (number of partners and condom-use) as functions of employment, housing, poverty, and education status, among persons who exchange sex. HIV transmissions and disease progression were then simulated as functions of behaviors to estimate incidence reductions. Social determinants are key drivers of many infectious and non-infectious diseases. Our work enables the development of decision support tools to holistically evaluate the syndemics of health and social inequity. }, keywords = {}, pubstate = {published}, tppubtype = {article} } As social and economic conditions are key determinants of HIV, the United States ‘National HIV/AIDS Strategy (NHAS)’, in addition to care and treatment, aims to address mental health, unemployment, food insecurity, and housing instability, as part of its strategic plan for the ‘Ending the HIV Epidemic’ initiative. Although mechanistic models of HIV play a key role in evaluating intervention strategies, social conditions are typically not part of the modeling framework. Challenges include the unavailability of coherent statistical data for social conditions and behaviors. We developed a method, combining undirected graphical modeling with copula methods, to integrate disparate data sources, to estimate joint probability distributions for social conditions and behaviors. 
We incorporated these in a national-level network model, Progression and Transmission of HIV (PATH 4.0), to simulate behaviors as functions of social conditions and HIV transmissions as a function of behaviors. As a demonstration for the potential applications of such a model, we conducted two hypothetical what-if intervention analyses to estimate the impact of an ideal 100% efficacious intervention strategy. The first analysis modeled care behavior (using viral suppression as proxy) as a function of depression, neighborhood, housing, poverty, education, insurance, and employment status. The second modeled sexual behaviors (number of partners and condom-use) as functions of employment, housing, poverty, and education status, among persons who exchange sex. HIV transmissions and disease progression were then simulated as functions of behaviors to estimate incidence reductions. Social determinants are key drivers of many infectious and non-infectious diseases. Our work enables the development of decision support tools to holistically evaluate the syndemics of health and social inequity. |
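To make the copula component of the entry above concrete, the following is a minimal sketch of how a Gaussian copula can couple two binary social-condition indicators with specified marginal prevalences and a chosen latent correlation. The prevalences, correlation value, and variable names are invented for illustration, and the paper's undirected-graphical-model estimation from disparate data sources is not shown.

```python
# Sketch of the copula idea only: couple two binary indicators via a Gaussian
# copula. Prevalences and the latent correlation are illustrative assumptions.
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(1)
p_unemployed, p_housing_unstable = 0.10, 0.15   # illustrative marginal prevalences
latent_corr = 0.5                               # illustrative copula correlation

cov = np.array([[1.0, latent_corr], [latent_corr, 1.0]])
z = rng.multivariate_normal(mean=[0.0, 0.0], cov=cov, size=100_000)
u = norm.cdf(z)                                  # Gaussian copula: uniform marginals
unemployed = u[:, 0] < p_unemployed              # inverse-CDF of a Bernoulli marginal
housing_unstable = u[:, 1] < p_housing_unstable

print("P(unemployed)                     ~", unemployed.mean())
print("P(housing unstable)               ~", housing_unstable.mean())
print("P(housing unstable | unemployed)  ~", housing_unstable[unemployed].mean())
```

With a positive latent correlation, the conditional prevalence exceeds the marginal one, which is the kind of dependence structure the joint distribution of social conditions is meant to capture.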
Alexandra Meliou, Azza Abouzied, Peter J Haas, Riddho R Haque, Anh Mai, Vasileios Vittis Data management perspectives on prescriptive analytics (invited talk) Presentation 14.05.2025. @misc{Meliou2025, title = {Data management perspectives on prescriptive analytics (invited talk)}, author = {Alexandra Meliou and Azza Abouzied and Peter J Haas and Riddho R Haque and Anh Mai and Vasileios Vittis}, url = {https://par.nsf.gov/biblio/10627357}, year = {2025}, date = {2025-05-14}, abstract = {Decision makers in a broad range of domains, such as finance, transportation, manufacturing, and healthcare, often need to derive optimal decisions given a set of constraints and objectives. Traditional solutions to such constrained optimization problems are typically application-specific, complex, and do not generalize. Further, the usual workflow requires slow, cumbersome, and error-prone data movement between a database, and predictive-modeling and optimization packages. All of these problems are exacerbated by the unprecedented size of modern data-intensive optimization problems. The emerging research area of in-database prescriptive analytics aims to provide seamless domain-independent, declarative, and scalable approaches powered by the system where the data typically resides: the database. Integrating optimization with database technology opens up prescriptive analytics to a much broader community, amplifying its benefits. We discuss how deep integration between the DBMS, predictive models, and optimization software creates opportunities for rich prescriptive-query functionality with good scalability and performance. Summarizing some of our main results and ongoing work in this area, we highlight challenges related to usability, scalability, data uncertainty, and dynamic environments, and argue that perspectives from data management research can drive novel strategies and solutions.}, keywords = {}, pubstate = {published}, tppubtype = {presentation} } Decision makers in a broad range of domains, such as finance, transportation, manufacturing, and healthcare, often need to derive optimal decisions given a set of constraints and objectives. Traditional solutions to such constrained optimization problems are typically application-specific, complex, and do not generalize. Further, the usual workflow requires slow, cumbersome, and error-prone data movement between a database, and predictive-modeling and optimization packages. All of these problems are exacerbated by the unprecedented size of modern data-intensive optimization problems. The emerging research area of in-database prescriptive analytics aims to provide seamless domain-independent, declarative, and scalable approaches powered by the system where the data typically resides: the database. Integrating optimization with database technology opens up prescriptive analytics to a much broader community, amplifying its benefits. We discuss how deep integration between the DBMS, predictive models, and optimization software creates opportunities for rich prescriptive-query functionality with good scalability and performance. Summarizing some of our main results and ongoing work in this area, we highlight challenges related to usability, scalability, data uncertainty, and dynamic environments, and argue that perspectives from data management research can drive novel strategies and solutions. |
2024 |
Peter J Haas Tutorial: Artificial Neural Networks for Discrete-Event Simulation Journal Article 2024 Winter Simulation Conference (WSC), 2024. @article{Haas2024, title = {Tutorial: Artificial Neural Networks for Discrete-Event Simulation}, author = {Peter J Haas}, url = {https://ieeexplore.ieee.org/abstract/document/10838940}, doi = {10.1109/WSC63780.2024.10838940}, year = {2024}, date = {2024-12-15}, journal = {2024 Winter Simulation Conference (WSC)}, abstract = {This advanced tutorial explores some recent applications of artificial neural networks (ANNs) to stochastic discrete-event simulation (DES). We first review some basic concepts and then give examples of how ANNs are being used in the context of DES to facilitate simulation input modeling, random variate generation, simulation metamodeling, optimization via simulation, and more. Combining ANNs and DES allows exploitation of the deep domain knowledge embodied in simulation models while simultaneously leveraging the ability of modern ML techniques to capture complex patterns and relationships in data. }, keywords = {}, pubstate = {published}, tppubtype = {article} } This advanced tutorial explores some recent applications of artificial neural networks (ANNs) to stochastic discrete-event simulation (DES). We first review some basic concepts and then give examples of how ANNs are being used in the context of DES to facilitate simulation input modeling, random variate generation, simulation metamodeling, optimization via simulation, and more. Combining ANNs and DES allows exploitation of the deep domain knowledge embodied in simulation models while simultaneously leveraging the ability of modern ML techniques to capture complex patterns and relationships in data. |
Riddho R Haque, Anh L Mai, Matteo Brucato, Azza Abouzied, Peter J Haas, Alexandra Meliou Stochastic sketchrefine: Scaling in-database decision-making under uncertainty to millions of tuples Journal Article arXiv preprint arXiv:2411.17915, 2024. @article{Haque2024, title = {Stochastic sketchrefine: Scaling in-database decision-making under uncertainty to millions of tuples}, author = {Riddho R Haque and Anh L Mai and Matteo Brucato and Azza Abouzied and Peter J Haas and Alexandra Meliou}, url = {https://arxiv.org/abs/2411.17915}, year = {2024}, date = {2024-11-24}, journal = {arXiv preprint arXiv:2411.17915}, abstract = {Decision making under uncertainty often requires choosing packages, or bags of tuples, that collectively optimize expected outcomes while limiting risks. Processing Stochastic Package Queries (SPQs) involves solving very large optimization problems on uncertain data. Monte Carlo methods create numerous scenarios, or sample realizations of the stochastic attributes of all the tuples, and generate packages with optimal objective values across these scenarios. The number of scenarios needed for accurate approximation - and hence the size of the optimization problem when using prior methods - increases with variance in the data, and the search space of the optimization problem increases exponentially with the number of tuples in the relation. Existing solvers take hours to process SPQs on large relations containing stochastic attributes with high variance. Besides enriching the SPaQL language to capture a broader class of risk specifications, we make two fundamental contributions towards scalable SPQ processing. First, to handle high variance, we propose risk-constraint linearization (RCL), which converts SPQs into Integer Linear Programs (ILPs) whose size is independent of the number of scenarios used. Solving these ILPs gives us feasible and near-optimal packages. Second, we propose Stochastic SketchRefine, a divide and conquer framework that breaks down a large stochastic optimization problem into subproblems involving smaller subsets of tuples. Our experiments show that, together, RCL and Stochastic SketchRefine produce high-quality packages in orders of magnitude lower runtime than the state of the art.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Decision making under uncertainty often requires choosing packages, or bags of tuples, that collectively optimize expected outcomes while limiting risks. Processing Stochastic Package Queries (SPQs) involves solving very large optimization problems on uncertain data. Monte Carlo methods create numerous scenarios, or sample realizations of the stochastic attributes of all the tuples, and generate packages with optimal objective values across these scenarios. The number of scenarios needed for accurate approximation - and hence the size of the optimization problem when using prior methods - increases with variance in the data, and the search space of the optimization problem increases exponentially with the number of tuples in the relation. Existing solvers take hours to process SPQs on large relations containing stochastic attributes with high variance. Besides enriching the SPaQL language to capture a broader class of risk specifications, we make two fundamental contributions towards scalable SPQ processing. First, to handle high variance, we propose risk-constraint linearization (RCL), which converts SPQs into Integer Linear Programs (ILPs) whose size is independent of the number of scenarios used. 
Solving these ILPs gives us feasible and near-optimal packages. Second, we propose Stochastic SketchRefine, a divide and conquer framework that breaks down a large stochastic optimization problem into subproblems involving smaller subsets of tuples. Our experiments show that, together, RCL and Stochastic SketchRefine produce high-quality packages in orders of magnitude lower runtime than the state of the art. |
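To illustrate the kind of optimization problem behind package queries, here is a minimal sketch that poses a tiny deterministic package selection as an integer linear program with the open-source PuLP solver. The relation, column names, and constraint values are invented, and the paper's stochastic ingredients (Monte Carlo scenarios, risk-constraint linearization, and the SketchRefine-style partitioning) are not shown.

```python
# Minimal sketch: a tiny deterministic "package query" posed as an ILP with PuLP.
# The relation, columns, and bounds are illustrative only; the paper's
# risk-constraint linearization over scenarios is not reproduced here.
import pulp

# Toy relation: (id, expected_gain, cost) -- e.g., candidate trades.
tuples = [
    {"id": 1, "gain": 12.0, "cost": 5.0},
    {"id": 2, "gain": 7.0,  "cost": 3.0},
    {"id": 3, "gain": 9.5,  "cost": 4.0},
    {"id": 4, "gain": 4.0,  "cost": 1.5},
]
budget = 8.0          # total-cost constraint
max_package_size = 3  # cardinality constraint

prob = pulp.LpProblem("toy_package_query", pulp.LpMaximize)
# One 0/1 decision variable per tuple: is the tuple in the package?
x = {t["id"]: pulp.LpVariable(f"x_{t['id']}", cat="Binary") for t in tuples}

prob += pulp.lpSum(t["gain"] * x[t["id"]] for t in tuples)            # objective
prob += pulp.lpSum(t["cost"] * x[t["id"]] for t in tuples) <= budget  # SUM(cost) <= budget
prob += pulp.lpSum(x.values()) <= max_package_size                    # COUNT(*) <= 3

prob.solve(pulp.PULP_CBC_CMD(msg=False))
package = [t["id"] for t in tuples if x[t["id"]].value() > 0.5]
print("selected package:", package, "objective:", pulp.value(prob.objective))
```

In the stochastic setting of the paper, the gains would be uncertain attributes and the risk constraints would be linearized before handing a problem of this shape to the ILP solver.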
2023 |
Wang Cen, Peter J Haas Efficient Hybrid Simulation Optimization via Graph Neural Network Metamodeling Journal Article 2023 Winter Simulation Conference (WSC), 2023. @article{Cen2023, title = {Efficient Hybrid Simulation Optimization via Graph Neural Network Metamodeling}, author = {Wang Cen and Peter J Haas}, doi = {10.1109/WSC60868.2023.10408474}, year = {2023}, date = {2023-12-10}, journal = {2023 Winter Simulation Conference (WSC)}, abstract = {Simulation metamodeling is essential for speeding up optimization via simulation to support rapid decision making. During optimization, the metamodel, rather than expensive simulation, is used to compute objective values. We recently developed graphical neural metamodels (GMMs) that use graph neural networks to allow the graphical structure of a simulation model to be treated as a metamodel input parameter that can be varied along with scalar inputs. In this paper we provide novel methods for using GMMs to solve hybrid optimization problems where both real-valued input parameters and graphical structure are jointly optimized. The key ideas are to modify Monte Carlo tree search to incorporate both discrete and continuous optimization and to leverage the automatic differentiation infrastructure used for neural network training to quickly compute gradients of the objective function during stochastic gradient descent. Experiments on stochastic activity network and warehouse models demonstrate the potential of our method. }, keywords = {}, pubstate = {published}, tppubtype = {article} } Simulation metamodeling is essential for speeding up optimization via simulation to support rapid decision making. During optimization, the metamodel, rather than expensive simulation, is used to compute objective values. We recently developed graphical neural metamodels (GMMs) that use graph neural networks to allow the graphical structure of a simulation model to be treated as a metamodel input parameter that can be varied along with scalar inputs. In this paper we provide novel methods for using GMMs to solve hybrid optimization problems where both real-valued input parameters and graphical structure are jointly optimized. The key ideas are to modify Monte Carlo tree search to incorporate both discrete and continuous optimization and to leverage the automatic differentiation infrastructure used for neural network training to quickly compute gradients of the objective function during stochastic gradient descent. Experiments on stochastic activity network and warehouse models demonstrate the potential of our method. |
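As a hedged sketch of the automatic-differentiation idea mentioned above, the following PyTorch snippet optimizes continuous inputs by backpropagating through a frozen neural metamodel. The small MLP stands in for a trained metamodel and is not the paper's graph neural network; the dimensions and bounds are invented, and the Monte Carlo tree search over graph structures is not shown.

```python
# Minimal sketch: optimize continuous simulation inputs through a frozen neural
# metamodel by backpropagating to the inputs (not to the network weights).
import torch

torch.manual_seed(0)

# Pretend this MLP was already trained to map 3 scalar inputs -> expected cost.
metamodel = torch.nn.Sequential(
    torch.nn.Linear(3, 32), torch.nn.ReLU(), torch.nn.Linear(32, 1)
)
for p in metamodel.parameters():
    p.requires_grad_(False)  # freeze the metamodel; only the inputs are optimized

theta = torch.rand(3, requires_grad=True)   # decision variables, assumed in [0, 1]
opt = torch.optim.SGD([theta], lr=0.05)

for step in range(200):
    opt.zero_grad()
    cost = metamodel(theta.unsqueeze(0)).squeeze()  # metamodel replaces simulation
    cost.backward()                                 # autodiff gives d(cost)/d(theta)
    opt.step()                                      # gradient-descent step (minimize)
    with torch.no_grad():
        theta.clamp_(0.0, 1.0)                      # keep inputs in their box bounds

print("approximately optimal inputs:", theta.detach().numpy())
```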
Pracheta Amaranath, Peter J. Haas, David Jensen, Sam Witty Causal Dynamic Bayesian Networks for Simulation Metamodeling Journal Article 2023 Winter Simulation Conference (WSC), pp. 746-757, 2023. @article{test, title = {Causal Dynamic Bayesian Networks for Simulation Metamodeling}, author = {Pracheta Amaranath and Peter J. Haas and David Jensen and Sam Witty}, url = {https://ieeexplore.ieee.org/abstract/document/10407746}, doi = {10.1109/WSC60868.2023.10407746}, year = {2023}, date = {2023-12-10}, journal = {2023 Winter Simulation Conference (WSC)}, pages = {746-757}, abstract = {A traditional metamodel for a discrete-event simulation approximates a real-valued performance measure as a function of the input-parameter values. We introduce a novel class of metamodels based on modular dynamic Bayesian networks (MDBNs), a subclass of probabilistic graphical models which can be used to efficiently answer a rich class of probabilistic and causal queries (PCQs). Such queries represent the joint probability distribution of the system state at multiple time points, given observations of, and interventions on, other state variables and input parameters. This paper is a first demonstration of how the extensive theory and technology of causal graphical models can be used to enhance simulation metamodeling. We demonstrate this potential by showing how a single MDBN for an M/M/1 queue can be learned from simulation data and then be used to quickly and accurately answer a variety of PCQs, most of which are out-of-scope for existing metamodels.}, keywords = {}, pubstate = {published}, tppubtype = {article} } A traditional metamodel for a discrete-event simulation approximates a real-valued performance measure as a function of the input-parameter values. We introduce a novel class of metamodels based on modular dynamic Bayesian networks (MDBNs), a subclass of probabilistic graphical models which can be used to efficiently answer a rich class of probabilistic and causal queries (PCQs). Such queries represent the joint probability distribution of the system state at multiple time points, given observations of, and interventions on, other state variables and input parameters. This paper is a first demonstration of how the extensive theory and technology of causal graphical models can be used to enhance simulation metamodeling. We demonstrate this potential by showing how a single MDBN for an M/M/1 queue can be learned from simulation data and then be used to quickly and accurately answer a variety of PCQs, most of which are out-of-scope for existing metamodels. |
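For context on the running example in the entry above, here is a minimal discrete-event simulation of an M/M/1 queue that records queue-length trajectories, the kind of simulation data from which a dynamic-Bayesian-network metamodel could be learned. The rates and sampling grid are illustrative, and the MDBN learning and causal-query machinery of the paper are not shown.

```python
# Sketch: generate queue-length trajectories from an M/M/1 queue by simulating
# competing exponential arrival/service clocks and sampling the state on a grid.
import random

def mm1_queue_lengths(lam, mu, horizon, dt, seed=0):
    """Queue length sampled every dt time units up to `horizon`."""
    rng = random.Random(seed)
    t, q, samples, next_sample = 0.0, 0, [], 0.0
    while t < horizon:
        rate = lam + (mu if q > 0 else 0.0)
        t += rng.expovariate(rate)                # time of the next event
        while next_sample <= min(t, horizon):     # record the state on the grid
            samples.append(q)
            next_sample += dt
        if rng.random() < lam / rate:             # arrival vs. departure
            q += 1
        elif q > 0:
            q -= 1
    return samples

trajectory = mm1_queue_lengths(lam=0.8, mu=1.0, horizon=100.0, dt=1.0)
print("queue length at t = 0, 1, 2, ...:", trajectory[:10])
```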
Juelin Liu, Sandeep Polisetty, Hui Guan, Marco Serafini GraphMini: Accelerating Graph Pattern Matching Using Auxiliary Graphs Journal Article International Conference on Parallel Architectures and Compilation Techniques, 2023. @article{liu2023graphmini, title = {GraphMini: Accelerating Graph Pattern Matching Using Auxiliary Graphs}, author = {Juelin Liu and Sandeep Polisetty and Hui Guan and Marco Serafini}, url = {https://marcoserafini.github.io/assets/pdf/GraphMini.pdf}, year = {2023}, date = {2023-11-01}, journal = {International Conference on Parallel Architectures and Compilation Techniques}, abstract = {Graph pattern matching is a fundamental problem encountered by many common graph mining tasks and the basic building block of several graph mining systems. This paper explores for the first time how to proactively prune graphs to speed up graph pattern matching by leveraging the structure of the query pattern and the input graph. We propose building auxiliary graphs, which are different pruned versions of the graph, during query execution. This requires careful balancing between the upfront cost of building and managing auxiliary graphs and the gains of faster set operations. To this end, we propose GraphMini, a new system that uses query compilation and a new cost model to minimize the cost of building and maintaining auxiliary graphs and maximize gains. Our evaluation shows that using GraphMini can achieve one order of magnitude speedup compared to state-of-the-art subgraph enumeration systems on commonly used benchmarks.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Graph pattern matching is a fundamental problem encountered by many common graph mining tasks and the basic building block of several graph mining systems. This paper explores for the first time how to proactively prune graphs to speed up graph pattern matching by leveraging the structure of the query pattern and the input graph. We propose building auxiliary graphs, which are different pruned versions of the graph, during query execution. This requires careful balancing between the upfront cost of building and managing auxiliary graphs and the gains of faster set operations. To this end, we propose GraphMini, a new system that uses query compilation and a new cost model to minimize the cost of building and maintaining auxiliary graphs and maximize gains. Our evaluation shows that using GraphMini can achieve one order of magnitude speedup compared to state-of-the-art subgraph enumeration systems on commonly used benchmarks. |
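To show where the set operations targeted by GraphMini arise, here is a toy triangle-enumeration sketch based on adjacency-set intersections. The graph is invented, and the system's auxiliary pruned graphs, cost model, and query compilation, whose role would be to shrink the sets being intersected, are not reproduced.

```python
# Sketch of set operations in graph pattern matching: enumerate triangles by
# intersecting the adjacency sets of each edge's endpoints.
from collections import defaultdict

edges = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4)]
adj = defaultdict(set)
for u, v in edges:
    adj[u].add(v)
    adj[v].add(u)

triangles = set()
for u, v in edges:
    # Every common neighbor of an edge's endpoints closes a triangle.
    for w in adj[u] & adj[v]:
        triangles.add(tuple(sorted((u, v, w))))

print("triangles:", sorted(triangles))
```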
Francisco Castro, Sahitya Raipura, Heather Conboy, Peter J Haas, Leon Osterweil, Ivon Arroyo Piloting an Interactive Ethics and Responsible Computing Learning Environment in Undergraduate CS Courses Journal Article Proceedings of the 54th ACM Technical Symposium on Computer Science Education, 2023. @article{castro2023piloting, title = {Piloting an Interactive Ethics and Responsible Computing Learning Environment in Undergraduate CS Courses}, author = {Francisco Castro and Sahitya Raipura and Heather Conboy and Peter J Haas and Leon Osterweil and Ivon Arroyo}, year = {2023}, date = {2023-11-01}, journal = {Proceedings of the 54th ACM Technical Symposium on Computer Science Education}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Wang Cen, Peter J Haas NIM: Generative Neural Networks for Automated Modeling and Generation of Simulation Inputs Journal Article ACM Transactions on Modeling and Computer Simulation, 33 (3), pp. 1–26, 2023. @article{cen2023nim, title = {NIM: Generative Neural Networks for Automated Modeling and Generation of Simulation Inputs}, author = {Wang Cen and Peter J Haas}, year = {2023}, date = {2023-11-01}, journal = {ACM Transactions on Modeling and Computer Simulation}, volume = {33}, number = {3}, pages = {1--26}, abstract = {Fitting stochastic input-process models to data and then sampling from them are key steps in a simulation study, but highly challenging to non-experts. We present Neural Input Modeling (NIM), a generative-neural-network (GNN) framework that exploits modern data-rich environments to automatically capture simulation input processes and then generate samples from them. The basic GNN that we develop, called NIM-VL, comprises (i) a variational-autoencoder (VAE) architecture that learns the probability distribution of the input data while avoiding overfitting and (ii) Long Short-Term Memory (LSTM) components that concisely capture statistical dependencies across time. We show how the basic GNN architecture can be modified to exploit known distributional properties—such as i.i.d. structure, nonnegativity, and multimodality—in order to increase accuracy and speed, as well as to handle multivariate processes, categorical-valued processes, and extrapolation beyond the training data for certain nonstationary processes. We also introduce an extension to NIM called “conditional” NIM (CNIM), which can learn from training data obtained under various realizations of a (possibly time-series-valued) stochastic “condition”, such as temperature or inflation rate, and then generate sample paths given a value of the condition not seen in the training data. This enables users to simulate a system under a specific working condition by customizing a pre-trained model; CNIM also facilitates what-if analysis. Extensive experiments show the efficacy of our approach. NIM can thus help overcome one of the key barriers to simulation for non-experts.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Fitting stochastic input-process models to data and then sampling from them are key steps in a simulation study, but highly challenging to non-experts. We present Neural Input Modeling (NIM), a generative-neural-network (GNN) framework that exploits modern data-rich environments to automatically capture simulation input processes and then generate samples from them. The basic GNN that we develop, called NIM-VL, comprises (i) a variational-autoencoder (VAE) architecture that learns the probability distribution of the input data while avoiding overfitting and (ii) Long Short-Term Memory (LSTM) components that concisely capture statistical dependencies across time. We show how the basic GNN architecture can be modified to exploit known distributional properties—such as i.i.d. structure, nonnegativity, and multimodality—in order to increase accuracy and speed, as well as to handle multivariate processes, categorical-valued processes, and extrapolation beyond the training data for certain nonstationary processes. 
We also introduce an extension to NIM called “conditional” NIM (CNIM), which can learn from training data obtained under various realizations of a (possibly time-series-valued) stochastic “condition”, such as temperature or inflation rate, and then generate sample paths given a value of the condition not seen in the training data. This enables users to simulate a system under a specific working condition by customizing a pre-trained model; CNIM also facilitates what-if analysis. Extensive experiments show the efficacy of our approach. NIM can thus help overcome one of the key barriers to simulation for non-experts. |
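As a rough illustration of the VAE-plus-LSTM combination described above, the following PyTorch sketch defines a small sequence VAE and runs one training step on synthetic data. Layer sizes, the decoder's input scheme, and the loss weighting are placeholder choices and do not reproduce the NIM-VL architecture or its distribution-specific variants.

```python
# Sketch of a sequence VAE with LSTM encoder/decoder; sizes and details are
# illustrative assumptions, not the paper's NIM-VL design.
import torch
import torch.nn as nn

class SeqVAE(nn.Module):
    def __init__(self, input_dim=1, hidden_dim=32, latent_dim=8):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.to_mu = nn.Linear(hidden_dim, latent_dim)
        self.to_logvar = nn.Linear(hidden_dim, latent_dim)
        self.latent_to_hidden = nn.Linear(latent_dim, hidden_dim)
        self.decoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.out = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        # Encode the whole sequence into a latent Gaussian.
        _, (h, _) = self.encoder(x)                  # h: (1, batch, hidden)
        mu, logvar = self.to_mu(h[-1]), self.to_logvar(h[-1])
        z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)   # reparameterize
        # Decode: seed the decoder state with z, feed the shifted input sequence.
        h0 = torch.tanh(self.latent_to_hidden(z)).unsqueeze(0)
        c0 = torch.zeros_like(h0)
        dec_in = torch.cat([torch.zeros_like(x[:, :1, :]), x[:, :-1, :]], dim=1)
        dec_out, _ = self.decoder(dec_in, (h0, c0))
        return self.out(dec_out), mu, logvar

def vae_loss(x_hat, x, mu, logvar):
    recon = ((x_hat - x) ** 2).mean()                              # reconstruction
    kld = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp()) # KL regularizer
    return recon + kld

# One illustrative training step on fake data: 64 sequences of length 20.
model = SeqVAE()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
x = torch.randn(64, 20, 1)
opt.zero_grad()
x_hat, mu, logvar = model(x)
loss = vae_loss(x_hat, x, mu, logvar)
loss.backward()
opt.step()
print("toy VAE loss:", float(loss))
```

After training, sampling z from a standard normal and running the decoder would generate new input traces, which is the generation step the framework automates for simulation users.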
Brian Hentschel, Peter J Haas, Yuanyuan Tian Exact PPS Sampling with Bounded Sample Size Journal Article Information Processing Letters, 182 , 2023. @article{hentschel2023exact, title = {Exact PPS Sampling with Bounded Sample Size}, author = {Brian Hentschel and Peter J Haas and Yuanyuan Tian}, url = {https://www.sciencedirect.com/science/article/pii/S002001902300025X?casa_token=znQuScsK51AAAAAA:0EEwp2QPaW7PetEN0NGy8wN0dsOC6Thx9voXduNx6ivnyMIg0WVuMI83TVVt2yiXqu0n4Atnwg}, year = {2023}, date = {2023-08-01}, journal = {Information Processing Letters}, volume = {182}, abstract = {Probability proportional to size (PPS) sampling schemes with a target sample size aim to produce a sample comprising a specified number n of items while ensuring that each item in the population appears in the sample with a probability proportional to its specified "weight" (also called its "size"). These two objectives, however, cannot always be achieved simultaneously. Existing PPS schemes prioritize control of the sample size, violating the PPS property if necessary. We provide a new PPS scheme that allows a different trade-off: our method enforces the PPS property at all times while ensuring that the sample size never exceeds the target value n. The sample size is exactly equal to n if possible, and otherwise has maximal expected value and minimal variance. Thus we bound the sample size, thereby avoiding storage overflows and helping to control the time required for analytics over the sample, while allowing the user complete control over the sample contents. The method is both simple to implement and efficient, being a one-pass streaming algorithm with an amortized processing time of O(1) per item.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Probability proportional to size (PPS) sampling schemes with a target sample size aim to produce a sample comprising a specified number n of items while ensuring that each item in the population appears in the sample with a probability proportional to its specified "weight" (also called its "size"). These two objectives, however, cannot always be achieved simultaneously. Existing PPS schemes prioritize control of the sample size, violating the PPS property if necessary. We provide a new PPS scheme that allows a different trade-off: our method enforces the PPS property at all times while ensuring that the sample size never exceeds the target value n. The sample size is exactly equal to n if possible, and otherwise has maximal expected value and minimal variance. Thus we bound the sample size, thereby avoiding storage overflows and helping to control the time required for analytics over the sample, while allowing the user complete control over the sample contents. The method is both simple to implement and efficient, being a one-pass streaming algorithm with an amortized processing time of O(1) per item. |
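For background on the PPS setting, here is a conceptual sketch that computes inclusion probabilities proportional to item weights for a target expected sample size and then draws one Poisson sample. It caps probabilities at one (redistributing mass) and only controls the sample size in expectation, so it is emphatically not the paper's bounded-size one-pass algorithm; it only illustrates the tension between the PPS property and sample-size control.

```python
# Conceptual sketch only: inclusion probabilities proportional to weights with
# target expected sample size n, realized by independent (Poisson) sampling.
import random

def pps_inclusion_probs(weights, n):
    """Return probs p_i proportional to w_i where possible, with sum(p) = n."""
    probs = [0.0] * len(weights)
    active = list(range(len(weights)))
    remaining = n
    while True:
        total = sum(weights[i] for i in active)
        capped = [i for i in active if weights[i] * remaining / total >= 1.0]
        if not capped:
            break
        for i in capped:
            probs[i] = 1.0          # heavy items are included with certainty
        active = [i for i in active if i not in capped]
        remaining -= len(capped)
    for i in active:
        probs[i] = weights[i] * remaining / sum(weights[j] for j in active)
    return probs

weights = [50.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0]
probs = pps_inclusion_probs(weights, n=3)
sample = [i for i, p in enumerate(probs) if random.random() < p]  # Poisson sampling
print("inclusion probs:", [round(p, 3) for p in probs])
print("one sampled index set:", sample)
```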
Chaitra Gopalappa, Hari Balasubramanian, Peter J Haas Infectious Disease Modelling, 8 (1), pp. 84-100, 2023. @article{gopalappa2023new, title = {A new mixed agent-based network and compartmental simulation framework for joint modeling of related infectious diseases- application to sexually transmitted infections}, author = {Chaitra Gopalappa and Hari Balasubramanian and Peter J Haas}, doi = {https://doi.org/10.1016/j.idm.2022.12.003}, year = {2023}, date = {2023-03-01}, journal = {Infectious Disease Modelling}, volume = {8}, number = {1}, pages = {84-100}, abstract = {Background A model that jointly simulates infectious diseases with common modes of transmission can serve as a decision-analytic tool to identify optimal intervention combinations for overall disease prevention. In the United States, sexually transmitted infections (STIs) are a huge economic burden, with a large fraction of the burden attributed to HIV. Data also show interactions between HIV and other sexually transmitted infections (STIs), such as higher risk of acquisition and progression of co-infections among persons with HIV compared to persons without. However, given the wide range in prevalence and incidence burdens of STIs, current compartmental or agent-based network simulation methods alone are insufficient or computationally burdensome for joint disease modeling. Further, causal factors for higher risk of coinfection could be both behavioral (i.e., compounding effects of individual behaviors, network structures, and care behaviors) and biological (i.e., presence of one disease can biologically increase the risk of another). However, the data on the fraction attributed to each are limited. Methods We present a new mixed agent-based compartmental (MAC) framework for jointly modeling STIs. It uses a combination of a new agent-based evolving network modeling (ABENM) technique for lower-prevalence diseases and compartmental modeling for higher-prevalence diseases. As a demonstration, we applied MAC to simulate lower-prevalence HIV in the United States and a higher-prevalence hypothetical Disease 2, using a range of transmission and progression rates to generate burdens replicative of the wide range of STIs. We simulated sexual transmissions among heterosexual males, heterosexual females, and men who have sex with men (men only and men and women). Setting the biological risk of co-infection to zero, we conducted numerical analyses to evaluate the influence of behavioral factors alone on disease dynamics. Results The contribution of behavioral factors to risk of coinfection was sensitive to disease burden, care access, and population heterogeneity and mixing. The contribution of behavioral factors was generally lower than observed risk of coinfections for the range of hypothetical prevalence studied here, suggesting potential role of biological factors, that should be investigated further specific to an STI. Conclusions The purpose of this study is to present a new simulation technique for jointly modeling infectious diseases that have common modes of transmission but varying epidemiological features. The numerical analysis serves as proof-of-concept for the application to STIs. Interactions between diseases are influenced by behavioral factors, are sensitive to care access and population features, and are likely exacerbated by biological factors. Social and economic conditions are among key drivers of behaviors that increase STI transmission, and thus, structural interventions are a key part of behavioral interventions. 
Joint modeling of diseases helps comprehensively simulate behavioral and biological factors of disease interactions to evaluate the true impact of common structural interventions on overall disease prevention. The new simulation framework is especially suited to simulate behavior as a function of social determinants, and further, to identify optimal combinations of common structural and disease-specific interventions.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Background A model that jointly simulates infectious diseases with common modes of transmission can serve as a decision-analytic tool to identify optimal intervention combinations for overall disease prevention. In the United States, sexually transmitted infections (STIs) are a huge economic burden, with a large fraction of the burden attributed to HIV. Data also show interactions between HIV and other sexually transmitted infections (STIs), such as higher risk of acquisition and progression of co-infections among persons with HIV compared to persons without. However, given the wide range in prevalence and incidence burdens of STIs, current compartmental or agent-based network simulation methods alone are insufficient or computationally burdensome for joint disease modeling. Further, causal factors for higher risk of coinfection could be both behavioral (i.e., compounding effects of individual behaviors, network structures, and care behaviors) and biological (i.e., presence of one disease can biologically increase the risk of another). However, the data on the fraction attributed to each are limited. Methods We present a new mixed agent-based compartmental (MAC) framework for jointly modeling STIs. It uses a combination of a new agent-based evolving network modeling (ABENM) technique for lower-prevalence diseases and compartmental modeling for higher-prevalence diseases. As a demonstration, we applied MAC to simulate lower-prevalence HIV in the United States and a higher-prevalence hypothetical Disease 2, using a range of transmission and progression rates to generate burdens replicative of the wide range of STIs. We simulated sexual transmissions among heterosexual males, heterosexual females, and men who have sex with men (men only and men and women). Setting the biological risk of co-infection to zero, we conducted numerical analyses to evaluate the influence of behavioral factors alone on disease dynamics. Results The contribution of behavioral factors to risk of coinfection was sensitive to disease burden, care access, and population heterogeneity and mixing. The contribution of behavioral factors was generally lower than observed risk of coinfections for the range of hypothetical prevalence studied here, suggesting potential role of biological factors, that should be investigated further specific to an STI. Conclusions The purpose of this study is to present a new simulation technique for jointly modeling infectious diseases that have common modes of transmission but varying epidemiological features. The numerical analysis serves as proof-of-concept for the application to STIs. Interactions between diseases are influenced by behavioral factors, are sensitive to care access and population features, and are likely exacerbated by biological factors. Social and economic conditions are among key drivers of behaviors that increase STI transmission, and thus, structural interventions are a key part of behavioral interventions. 
Joint modeling of diseases helps comprehensively simulate behavioral and biological factors of disease interactions to evaluate the true impact of common structural interventions on overall disease prevention. The new simulation framework is especially suited to simulate behavior as a function of social determinants, and further, to identify optimal combinations of common structural and disease-specific interventions. |
2022 |
Ryan McKenna, Brett Mullins, Daniel Sheldon, Gerome Miklau AIM: an adaptive and iterative mechanism for differentially private synthetic data Inproceedings Proceedings of the VLDB Endowment, pp. 2599–2612, 2022. @inproceedings{McKenna2022AIM, title = {AIM: an adaptive and iterative mechanism for differentially private synthetic data}, author = {Ryan McKenna and Brett Mullins and Daniel Sheldon and Gerome Miklau}, doi = {https://doi.org/10.14778/3551793.3551817}, year = {2022}, date = {2022-06-01}, booktitle = {Proceedings of the VLDB Endowment}, journal = {Proceedings of the VLDB Endowment}, volume = {15}, number = {11}, pages = {2599–2612}, abstract = {We propose AIM, a new algorithm for differentially private synthetic data generation. AIM is a workload-adaptive algorithm within the paradigm of algorithms that first selects a set of queries, then privately measures those queries, and finally generates synthetic data from the noisy measurements. It uses a set of innovative features to iteratively select the most useful measurements, reflecting both their relevance to the workload and their value in approximating the input data. We also provide analytic expressions to bound per-query error with high probability which can be used to construct confidence intervals and inform users about the accuracy of generated data. We show empirically that AIM consistently outperforms a wide variety of existing mechanisms across a variety of experimental settings.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } We propose AIM, a new algorithm for differentially private synthetic data generation. AIM is a workload-adaptive algorithm within the paradigm of algorithms that first selects a set of queries, then privately measures those queries, and finally generates synthetic data from the noisy measurements. It uses a set of innovative features to iteratively select the most useful measurements, reflecting both their relevance to the workload and their value in approximating the input data. We also provide analytic expressions to bound per-query error with high probability which can be used to construct confidence intervals and inform users about the accuracy of generated data. We show empirically that AIM consistently outperforms a wide variety of existing mechanisms across a variety of experimental settings. |
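To ground the select-measure-generate paradigm mentioned above, here is a sketch of the "measure" step alone: adding Gaussian noise to a two-way marginal of a toy table. The columns, data, and noise scale are invented, and AIM's adaptive query selection, privacy accounting, and graphical-model-based data generation are not shown.

```python
# Sketch of privately measuring one 2-way marginal with Gaussian noise.
# In a full mechanism, sigma would be calibrated to the (epsilon, delta) budget
# and the query's L2 sensitivity; here it is a fixed illustrative value.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "age_bucket": rng.integers(0, 5, size=1000),
    "income_bucket": rng.integers(0, 4, size=1000),
})

def noisy_marginal(data, cols, sigma):
    """Exact marginal counts plus i.i.d. Gaussian noise of scale sigma."""
    counts = data.groupby(cols).size().unstack(fill_value=0).to_numpy(dtype=float)
    return counts + rng.normal(scale=sigma, size=counts.shape)

measurement = noisy_marginal(df, ["age_bucket", "income_bucket"], sigma=5.0)
print(np.round(measurement, 1))
```

A workload-adaptive mechanism would iterate: pick the next marginal whose noisy measurement is expected to improve the model most, measure it as above, and refit the generative model to all measurements taken so far.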
Dave Archer, Michael A August, Georgios Bouloukakis, Christopher Davison, Mamadou H Diallo, Dhrubajyoti Ghosh, Christopher T Graves, Michael Hay, Xi He, Peeter Laud, Steve Lu, Ashwin Machanavajjhala, Sharad Mehrotra, Gerome Miklau, Alisa Pankova, Shantanu Sharma, Nalini Venkatasubramanian, Guoxi Wang, Roberto Yus Transitioning from testbeds to ships: an experience study in deploying the TIPPERS Internet of Things platform to the US Navy Journal Article The Journal of Defense Modeling and Simulation, 19 (3), pp. 501-517, 2022. @article{Archer2022transition, title = {Transitioning from testbeds to ships: an experience study in deploying the TIPPERS Internet of Things platform to the US Navy}, author = {Dave Archer and Michael A August and Georgios Bouloukakis and Christopher Davison and Mamadou H Diallo and Dhrubajyoti Ghosh and Christopher T Graves and Michael Hay and Xi He and Peeter Laud and Steve Lu and Ashwin Machanavajjhala and Sharad Mehrotra and Gerome Miklau and Alisa Pankova and Shantanu Sharma and Nalini Venkatasubramanian and Guoxi Wang and Roberto Yus}, doi = {https://doi.org/10.1177/154851292095638}, year = {2022}, date = {2022-07-01}, journal = {The Journal of Defense Modeling and Simulation}, volume = {19}, number = {3}, pages = {501-517}, abstract = {This paper describes the collaborative effort between privacy and security researchers at nine different institutions along with researchers at the Naval Information Warfare Center to deploy, test, and demonstrate privacy-preserving technologies in creating sensor-based awareness using the Internet of Things (IoT) aboard naval vessels in the context of the US Navy’s Trident Warrior 2019 exercise. Funded by DARPA through the Brandeis program, the team built an integrated IoT data management middleware, entitled TIPPERS, that supports privacy by design and integrates a variety of Privacy Enhancing Technologies (PETs), including differential privacy, computation on encrypted data, and fine-grained policies. We describe the architecture of TIPPERS and its use in creating a smart ship that offers IoT-enabled services such as occupancy analysis, fall detection, detection of unauthorized access to spaces, and other situational awareness scenarios. We describe the privacy implications of creating IoT spaces that collect data that might include individuals’ data (e.g., location) and analyze the tradeoff between privacy and utility of the supported PETs in this context.}, keywords = {}, pubstate = {published}, tppubtype = {article} } This paper describes the collaborative effort between privacy and security researchers at nine different institutions along with researchers at the Naval Information Warfare Center to deploy, test, and demonstrate privacy-preserving technologies in creating sensor-based awareness using the Internet of Things (IoT) aboard naval vessels in the context of the US Navy’s Trident Warrior 2019 exercise. Funded by DARPA through the Brandeis program, the team built an integrated IoT data management middleware, entitled TIPPERS, that supports privacy by design and integrates a variety of Privacy Enhancing Technologies (PETs), including differential privacy, computation on encrypted data, and fine-grained policies. We describe the architecture of TIPPERS and its use in creating a smart ship that offers IoT-enabled services such as occupancy analysis, fall detection, detection of unauthorized access to spaces, and other situational awareness scenarios. 
We describe the privacy implications of creating IoT spaces that collect data that might include individuals’ data (e.g., location) and analyze the tradeoff between privacy and utility of the supported PETs in this context. |
Sainyam Galhotra, Anna Fariha, Raoni Lourenço, Juliana Freire, Alexandra Meliou, Divesh Srivastava DataPrism: Exposing Disconnect between Data and Systems Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2022, pp. 217-231, 2022. @inproceedings{galhotra2022dataprism, title = {DataPrism: Exposing Disconnect between Data and Systems}, author = {Sainyam Galhotra and Anna Fariha and Raoni Lourenço and Juliana Freire and Alexandra Meliou and Divesh Srivastava}, doi = {https://doi.org/10.1145/3514221.3517864}, year = {2022}, date = {2022-06-01}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2022}, pages = {217-231}, abstract = {As data is a central component of many modern systems, the cause of a system malfunction may reside in the data, and, specifically, particular properties of data. E.g., a health-monitoring system that is designed under the assumption that weight is reported in lbs will malfunction when encountering weight reported in kilograms. Like software debugging, which aims to find bugs in the source code or runtime conditions, our goal is to debug data to identify potential sources of disconnect between the assumptions about some data and systems that operate on that data. We propose DataPrism, a framework to identify data properties (profiles) that are the root causes of performance degradation or failure of a data-driven system. Such identification is necessary to repair data and resolve the disconnect between data and systems. Our technique is based on causal reasoning through interventions: when a system malfunctions for a dataset, DataPrism alters the data profiles and observes changes in the system's behavior due to the alteration. Unlike statistical observational analysis that reports mere correlations, DataPrism reports causally verified root causes -- in terms of data profiles -- of the system malfunction. We empirically evaluate DataPrism on seven real-world and several synthetic data-driven systems that fail on certain datasets due to a diverse set of reasons. In all cases, DataPrism identifies the root causes precisely while requiring orders of magnitude fewer interventions than prior techniques.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } As data is a central component of many modern systems, the cause of a system malfunction may reside in the data, and, specifically, particular properties of data. E.g., a health-monitoring system that is designed under the assumption that weight is reported in lbs will malfunction when encountering weight reported in kilograms. Like software debugging, which aims to find bugs in the source code or runtime conditions, our goal is to debug data to identify potential sources of disconnect between the assumptions about some data and systems that operate on that data. We propose DataPrism, a framework to identify data properties (profiles) that are the root causes of performance degradation or failure of a data-driven system. Such identification is necessary to repair data and resolve the disconnect between data and systems. Our technique is based on causal reasoning through interventions: when a system malfunctions for a dataset, DataPrism alters the data profiles and observes changes in the system's behavior due to the alteration. Unlike statistical observational analysis that reports mere correlations, DataPrism reports causally verified root causes -- in terms of data profiles -- of the system malfunction. 
We empirically evaluate DataPrism on seven real-world and several synthetic data-driven systems that fail on certain datasets due to a diverse set of reasons. In all cases, DataPrism identifies the root causes precisely while requiring orders of magnitude fewer interventions than prior techniques. |
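Echoing the abstract's lbs-versus-kg example, here is a toy sketch of intervention-based debugging: flip one candidate data property (the weight unit) and compare the downstream system's behavior. The "system", the alert threshold, and the data are invented, and DataPrism's profile language and causal search over interventions are not shown.

```python
# Sketch of a single intervention on a data profile (the unit of the weight
# column) and its effect on a toy downstream system.
def health_alert_system(weights_lbs):
    """Toy downstream system: flags patients above 220 lbs."""
    return [w > 220 for w in weights_lbs]

weights_kg = [70, 105, 88, 120]        # the data actually arrives in kilograms

baseline = health_alert_system(weights_kg)                            # assumption violated
intervened = health_alert_system([w * 2.20462 for w in weights_kg])   # intervene: kg -> lbs

print("alerts with raw data:          ", baseline)
print("alerts after unit intervention:", intervened)
# A large behavior change under this single intervention points to the weight
# unit as a causally relevant data profile for the malfunction.
```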
Maliha Tashfia Islam, Anna Fariha, Alexandra Meliou, Babak Salimi Through the data management lens: Experimental analysis and evaluation of fair classification Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2022., pp. 232-246, 2022. @inproceedings{islam2022through, title = {Through the data management lens: Experimental analysis and evaluation of fair classification}, author = {Maliha Tashfia Islam and Anna Fariha and Alexandra Meliou and Babak Salimi}, doi = {https://doi.org/10.1145/3514221.3517841}, year = {2022}, date = {2022-06-01}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2022.}, pages = {232-246}, abstract = {Classification, a heavily-studied data-driven machine learning task, drives an increasing number of prediction systems involving critical human decisions such as loan approval and criminal risk assessment. However, classifiers often demonstrate discriminatory behavior, especially when presented with biased data. Consequently, fairness in classification has emerged as a high-priority research area. Data management research is showing an increasing presence and interest in topics related to data and algorithmic fairness, including the topic of fair classification. The interdisciplinary efforts in fair classification, with machine learning research having the largest presence, have resulted in a large number of fairness notions and a wide range of approaches that have not been systematically evaluated and compared. In this paper, we contribute a broad analysis of 13 fair classification approaches and additional variants, over their correctness, fairness, efficiency, scalability, robustness to data errors, sensitivity to underlying ML model, data efficiency, and stability using a variety of metrics and real-world datasets. Our analysis highlights novel insights on the impact of different metrics and high-level approach characteristics on different aspects of performance. We also discuss general principles for choosing approaches suitable for different practical settings, and identify areas where data-management-centric solutions are likely to have the most impact.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Classification, a heavily-studied data-driven machine learning task, drives an increasing number of prediction systems involving critical human decisions such as loan approval and criminal risk assessment. However, classifiers often demonstrate discriminatory behavior, especially when presented with biased data. Consequently, fairness in classification has emerged as a high-priority research area. Data management research is showing an increasing presence and interest in topics related to data and algorithmic fairness, including the topic of fair classification. The interdisciplinary efforts in fair classification, with machine learning research having the largest presence, have resulted in a large number of fairness notions and a wide range of approaches that have not been systematically evaluated and compared. In this paper, we contribute a broad analysis of 13 fair classification approaches and additional variants, over their correctness, fairness, efficiency, scalability, robustness to data errors, sensitivity to underlying ML model, data efficiency, and stability using a variety of metrics and real-world datasets. 
Our analysis highlights novel insights on the impact of different metrics and high-level approach characteristics on different aspects of performance. We also discuss general principles for choosing approaches suitable for different practical settings, and identify areas where data-management-centric solutions are likely to have the most impact. |
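As a concrete reference point for two of the group-fairness criteria such studies compare, here is a small numpy sketch computing a demographic-parity gap and an equalized-odds gap for a binary classifier and a binary sensitive attribute. The metric definitions are standard; the data below is made up, and the paper's evaluation covers many more criteria, approaches, and datasets.

```python
# Minimal sketch of two common group-fairness metrics (illustrative data).
import numpy as np

def demographic_parity_gap(y_pred, group):
    """|P(yhat=1 | g=0) - P(yhat=1 | g=1)| for a binary sensitive attribute."""
    y_pred, group = np.asarray(y_pred), np.asarray(group)
    return abs(y_pred[group == 0].mean() - y_pred[group == 1].mean())

def equalized_odds_gap(y_true, y_pred, group):
    """Max gap in true-positive rate and false-positive rate across groups."""
    y_true, y_pred, group = map(np.asarray, (y_true, y_pred, group))
    gaps = []
    for label in (1, 0):                      # 1 -> TPR gap, 0 -> FPR gap
        rates = [y_pred[(group == g) & (y_true == label)].mean() for g in (0, 1)]
        gaps.append(abs(rates[0] - rates[1]))
    return max(gaps)

y_true = [1, 0, 1, 1, 0, 0, 1, 0]
y_pred = [1, 0, 1, 0, 1, 0, 1, 0]
group  = [0, 0, 0, 0, 1, 1, 1, 1]
print(demographic_parity_gap(y_pred, group), equalized_odds_gap(y_true, y_pred, group))
```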
Raghavendra Addanki, Andrew McGregor, Alexandra Meliou, Zafeiria Moumoulidou Improved approximation and scalability for fair max-min diversification Inproceedings ICDT 2022, 2022. @inproceedings{addanki2022improved, title = {Improved approximation and scalability for fair max-min diversification}, author = {Raghavendra Addanki and Andrew McGregor and Alexandra Meliou and Zafeiria Moumoulidou}, url = {https://arxiv.org/abs/2201.06678}, year = {2022}, date = {2022-01-01}, booktitle = {ICDT 2022}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Wang Cen, Peter J. Haas Enhanced Simulation Metamodeling via Graph and Generative Neural Networks Inproceedings Winter Simulation Conference, WSC 2022, Singapore, December 11-14, 2022, pp. 2748-2759, 2022. @inproceedings{cen2022enhanced, title = {Enhanced Simulation Metamodeling via Graph and Generative Neural Networks}, author = {Wang Cen and Peter J. Haas}, doi = {https://doi.org/10.1109/WSC57314.2022.10015361}, year = {2022}, date = {2022-01-01}, booktitle = {Winter Simulation Conference, WSC 2022, Singapore, December 11-14, 2022}, pages = {2748-2759}, abstract = {For large, complex simulation models, simulation metamodeling is crucial for enabling simulation-based-optimization under uncertainty in operational settings where results are needed quickly. We enhance simulation metamodeling in two important ways. First, we use graph neural networks (GrNN) to allow the graphical structure of a simulation model to be treated as a metamodel input parameter that can be varied along with real-valued and integer-ordered inputs. Second, we combine GrNNs with generative neural networks so that a metamodel can rapidly produce not only a summary statistic like E[Y] , but also a sequence of i.i.d. samples of Y or even a stochastic process that mimics dynamic simulation outputs. Thus a single metamodel can be used to estimate multiple statistics for multiple performance measures. Our metamodels can potentially serve as surrogate models in digital-twin settings. Preliminary experiments indicate the promise of our approach.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } For large, complex simulation models, simulation metamodeling is crucial for enabling simulation-based-optimization under uncertainty in operational settings where results are needed quickly. We enhance simulation metamodeling in two important ways. First, we use graph neural networks (GrNN) to allow the graphical structure of a simulation model to be treated as a metamodel input parameter that can be varied along with real-valued and integer-ordered inputs. Second, we combine GrNNs with generative neural networks so that a metamodel can rapidly produce not only a summary statistic like E[Y] , but also a sequence of i.i.d. samples of Y or even a stochastic process that mimics dynamic simulation outputs. Thus a single metamodel can be used to estimate multiple statistics for multiple performance measures. Our metamodels can potentially serve as surrogate models in digital-twin settings. Preliminary experiments indicate the promise of our approach. |
Sneha Gathani, Madelon Hulsebos, James Gale, Peter J. Haas, Çağatay Demiralp Augmenting Decision Making via Interactive What-If Analysis Inproceedings 12th Conference on Innovative Data Systems Research, CIDR 2022, Chaminade, CA, USA, January 9-12, 2022, 2022. @inproceedings{gathani2021augmenting, title = {Augmenting Decision Making via Interactive What-If Analysis}, author = {Sneha Gathani and Madelon Hulsebos and James Gale and Peter J. Haas and Çağatay Demiralp }, url = {https://www.cidrdb.org/cidr2022/papers/p49-gathani.pdf}, year = {2022}, date = {2022-01-09}, booktitle = {12th Conference on Innovative Data Systems Research, CIDR 2022, Chaminade, CA, USA, January 9-12, 2022}, abstract = {The fundamental goal of business data analysis is to improve business decisions using data. Business users often make decisions to achieve key performance indicators (KPIs) such as increasing customer retention or sales, or decreasing costs. To discover the relationship between data attributes hypothesized to be drivers and those corresponding to KPIs of interest, business users currently need to perform lengthy exploratory analyses. This involves considering multitudes of combinations and scenarios and performing slicing, dicing, and transformations on the data accordingly, e.g., analyzing customer retention across quarters of the year or suggesting optimal media channels across strata of customers. However, the increasing complexity of datasets combined with the cognitive limitations of humans makes it challenging to carry over multiple hypotheses, even for simple datasets. Therefore mentally performing such analyses is hard. Existing commercial tools either provide partial solutions or fail to cater to business users altogether. Here we argue for four functionalities to enable business users to interactively learn and reason about the relationships between sets of data attributes thereby facilitating data-driven decision making. We implement these functionalities in SystemD, an interactive visual data analysis system enabling business users to experiment with the data by asking what-if questions. We evaluate the system through three business use cases: marketing mix modeling, customer retention analysis, and deal closing analysis, and report on feedback from multiple business users. Users find the SystemD functionalities highly useful for quick testing and validation of their hypotheses around their KPIs of interest, addressing their unmet analysis needs. The feedback also suggests that the UX design can be enhanced to further improve the understandability of these functionalities.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The fundamental goal of business data analysis is to improve business decisions using data. Business users often make decisions to achieve key performance indicators (KPIs) such as increasing customer retention or sales, or decreasing costs. To discover the relationship between data attributes hypothesized to be drivers and those corresponding to KPIs of interest, business users currently need to perform lengthy exploratory analyses. This involves considering multitudes of combinations and scenarios and performing slicing, dicing, and transformations on the data accordingly, e.g., analyzing customer retention across quarters of the year or suggesting optimal media channels across strata of customers. 
However, the increasing complexity of datasets combined with the cognitive limitations of humans makes it challenging to carry over multiple hypotheses, even for simple datasets. Therefore mentally performing such analyses is hard. Existing commercial tools either provide partial solutions or fail to cater to business users altogether. Here we argue for four functionalities to enable business users to interactively learn and reason about the relationships between sets of data attributes thereby facilitating data-driven decision making. We implement these functionalities in SystemD, an interactive visual data analysis system enabling business users to experiment with the data by asking what-if questions. We evaluate the system through three business use cases: marketing mix modeling, customer retention analysis, and deal closing analysis, and report on feedback from multiple business users. Users find the SystemD functionalities highly useful for quick testing and validation of their hypotheses around their KPIs of interest, addressing their unmet analysis needs. The feedback also suggests that the UX design can be enhanced to further improve the understandability of these functionalities. |
Azza Abouzied, Peter J Haas, Alexandra Meliou In-Database Decision Support: Opportunities and Challenges Journal Article IEEE Data Engineering Bulletin, 45 (3), pp. 102-115, 2022. @article{abouzied2022database, title = {In-Database Decision Support: Opportunities and Challenges}, author = {Azza Abouzied and Peter J Haas and Alexandra Meliou}, url = {https://people.cs.umass.edu/~phaas/files/DEBulletin2022.pdf}, year = {2022}, date = {2022-09-01}, journal = {IEEE Data Engineering Bulletin}, volume = {45}, number = {3}, pages = {102-115}, abstract = {Decision makers in a broad range of domains, such as finance, transportation, manufacturing, and healthcare, often need to derive optimal decisions given a set of constraints and objectives. Traditional solutions to such constrained optimization problems are typically application-specific, complex, and do not generalize. Further, the usual workflow requires slow, cumbersome, and error-prone data movement between a database and predictive-modeling and optimization packages. All of these problems are ex- acerbated by the unprecedented size of modern data-intensive optimization problems. The emerging re- search area of in-database prescriptive analytics aims to provide seamless domain-independent, declar- ative, and scalable approaches powered by the system where the data typically resides: the database. Integrating optimization with database technology opens up prescriptive analytics to a much broader community, amplifying its benefits. In the context of our prior and ongoing work in this area, we discuss some strategies for addressing key challenges related to usability, scalability, data uncertainty, dynamic environments with changing data and models, and the need to support decision-making agents. We in- dicate how deep integration between the DBMS, predictive models, and optimization software creates opportunities for rich prescriptive-query functionality with good scalability and performance.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Decision makers in a broad range of domains, such as finance, transportation, manufacturing, and healthcare, often need to derive optimal decisions given a set of constraints and objectives. Traditional solutions to such constrained optimization problems are typically application-specific, complex, and do not generalize. Further, the usual workflow requires slow, cumbersome, and error-prone data movement between a database and predictive-modeling and optimization packages. All of these problems are ex- acerbated by the unprecedented size of modern data-intensive optimization problems. The emerging re- search area of in-database prescriptive analytics aims to provide seamless domain-independent, declar- ative, and scalable approaches powered by the system where the data typically resides: the database. Integrating optimization with database technology opens up prescriptive analytics to a much broader community, amplifying its benefits. In the context of our prior and ongoing work in this area, we discuss some strategies for addressing key challenges related to usability, scalability, data uncertainty, dynamic environments with changing data and models, and the need to support decision-making agents. We in- dicate how deep integration between the DBMS, predictive models, and optimization software creates opportunities for rich prescriptive-query functionality with good scalability and performance. |
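The flavor of optimization problem described above can be made concrete with a tiny, deliberately naive sketch: choose a "package" of rows that satisfies aggregate constraints while minimizing total cost. The table, constraints, and brute-force search are illustrative assumptions; in-database approaches instead compile such queries down to a solver and scale far beyond toy inputs.

```python
# Toy sketch of an in-database "package" selection problem: pick k rows that
# jointly satisfy aggregate constraints while minimizing total price.
from itertools import combinations

meals = [  # hypothetical rows: (name, calories, protein_g, price)
    ("salad", 300, 10, 6.0), ("chicken", 500, 40, 9.0),
    ("pasta", 700, 20, 8.0), ("yogurt", 150, 12, 3.0), ("steak", 600, 45, 15.0),
]

def best_package(rows, k=3, max_cal=1500, min_protein=60):
    best = None
    for combo in combinations(rows, k):            # brute force: fine only for tiny tables
        cal = sum(r[1] for r in combo)
        protein = sum(r[2] for r in combo)
        price = sum(r[3] for r in combo)
        if cal <= max_cal and protein >= min_protein:
            if best is None or price < best[0]:
                best = (price, [r[0] for r in combo])
    return best

print(best_package(meals))   # -> (18.0, ['salad', 'chicken', 'yogurt'])
```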
2021 |
Fei Song, Khaled Zaouk, Chenghao Lyu, Arnab Sinha, Qi Fan, Yanlei Diao, Prashant J Shenoy Spark-based Cloud Data Analytics using Multi-Objective Optimization Inproceedings 37th IEEE International Conference on Data Engineering, ICDE 2021, Chania, Greece, April 19-22, 2021, pp. 396–407, IEEE, 2021. @inproceedings{DBLP:conf/icde/SongZLSFDS21, title = {Spark-based Cloud Data Analytics using Multi-Objective Optimization}, author = {Fei Song and Khaled Zaouk and Chenghao Lyu and Arnab Sinha and Qi Fan and Yanlei Diao and Prashant J Shenoy}, url = {https://doi.org/10.1109/ICDE51399.2021.00041 http://scalla.cs.umass.edu/papers/udao2020.pdf}, doi = {10.1109/ICDE51399.2021.00041}, year = {2021}, date = {2021-01-01}, booktitle = {37th IEEE International Conference on Data Engineering, ICDE 2021, Chania, Greece, April 19-22, 2021}, pages = {396--407}, publisher = {IEEE}, abstract = {Data analytics in the cloud has become an integral part of enterprise businesses. Big data analytics systems, however, still lack the ability to take task objectives such as user performance goals and budgetary constraints and automatically configure an analytic job to achieve these objectives. This paper presents UDAO, a Spark-based Unified Data Analytics Optimizer that can automatically determine a cluster configuration with a suitable number of cores as well as other system parameters that best meet the task objectives. At a core of our work is a principled multi-objective optimization (MOO) approach that computes a Pareto optimal set of configurations to reveal tradeoffs between different objectives, recommends a new Spark configuration that best explores such tradeoffs, and employs novel optimizations to enable such recommendations within a few seconds. Detailed experiments using benchmark workloads show that our MOO techniques provide a 2-50x speedup over existing MOO methods, while offering good coverage of the Pareto frontier. Compared to Ottertune, a state-of-the-art performance tuning system, UDAO recommends Spark configurations that yield 26%-49% reduction of running time of the TPCx-BB benchmark while adapting to different user preferences on multiple objectives.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Data analytics in the cloud has become an integral part of enterprise businesses. Big data analytics systems, however, still lack the ability to take task objectives such as user performance goals and budgetary constraints and automatically configure an analytic job to achieve these objectives. This paper presents UDAO, a Spark-based Unified Data Analytics Optimizer that can automatically determine a cluster configuration with a suitable number of cores as well as other system parameters that best meet the task objectives. At a core of our work is a principled multi-objective optimization (MOO) approach that computes a Pareto optimal set of configurations to reveal tradeoffs between different objectives, recommends a new Spark configuration that best explores such tradeoffs, and employs novel optimizations to enable such recommendations within a few seconds. Detailed experiments using benchmark workloads show that our MOO techniques provide a 2-50x speedup over existing MOO methods, while offering good coverage of the Pareto frontier. Compared to Ottertune, a state-of-the-art performance tuning system, UDAO recommends Spark configurations that yield 26%-49% reduction of running time of the TPCx-BB benchmark while adapting to different user preferences on multiple objectives. |
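At the heart of any such multi-objective tuner is a Pareto-filtering step. The sketch below keeps only configurations that are not dominated on (latency, cost); the candidate configurations and numbers are hypothetical, and UDAO additionally relies on learned performance models and far faster MOO solvers than this quadratic scan.

```python
# Sketch of Pareto filtering for configuration tuning (lower is better for both objectives).
def pareto_front(points):
    """points: list of (config, latency, cost); return the non-dominated subset."""
    front = []
    for cfg, lat, cost in points:
        dominated = any(l <= lat and c <= cost and (l, c) != (lat, cost)
                        for _, l, c in points)
        if not dominated:
            front.append((cfg, lat, cost))
    return front

candidates = [("8 cores", 120, 0.50), ("16 cores", 80, 0.90),
              ("32 cores", 70, 1.60), ("16 cores, spill to disk", 95, 0.95)]
print(pareto_front(candidates))   # the "spill to disk" setting is dominated and dropped
```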
Anna Fariha, Ashish Tiwari, Alexandra Meliou, Arjun Radhakrishna, Sumit Gulwani CoCo: Interactive Exploration of Conformance Constraints for Data Understanding and Data Cleaning Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2021. @inproceedings{FarihaTMRG21demo, title = {CoCo: Interactive Exploration of Conformance Constraints for Data Understanding and Data Cleaning}, author = {Anna Fariha and Ashish Tiwari and Alexandra Meliou and Arjun Radhakrishna and Sumit Gulwani}, url = {https://afariha.github.io/papers/CoCo_SIGMOD_2021_Demo.pdf}, doi = {10.1145/3448016.3452750}, year = {2021}, date = {2021-06-18}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD)}, abstract = {Data profiling refers to the task of extracting technical metadata or profiles and has numerous applications such as data understanding, validation, integration, and cleaning. While a number of data profiling primitives exist in the literature, most of them are limited to categorical attributes. A few techniques consider numerical attributes; but, they either focus on simple relationships involving a pair of attributes (e.g., correlations) or convert the continuous semantics of numerical attributes to a discrete semantics, which results in information loss. To capture more complex relationships involving the numerical attributes, we developed a new data-profiling primitive called conformance constraints, which can model linear arithmetic relationships involving multiple numerical attributes. We present CoCo, a system that allows interactive discovery and exploration of Conformance Constraints for understanding trends involving the numerical attributes of a dataset, with a particular focus on the application of data cleaning. Through a simple interface, CoCo enables the user to guide conformance constraint discovery according to their preferences. The user can examine to what extent a new, possibly dirty, dataset satisfies or violates the discovered conformance constraints. Further, CoCo provides useful suggestions for cleaning dirty data tuples, where the user can interactively alter cell values, and verify by checking change in conformance constraint violation due to the alteration. We demonstrate how CoCo can help in understanding trends in the data and assist the users in interactive data cleaning, using conformance constraints}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Data profiling refers to the task of extracting technical metadata or profiles and has numerous applications such as data understanding, validation, integration, and cleaning. While a number of data profiling primitives exist in the literature, most of them are limited to categorical attributes. A few techniques consider numerical attributes; but, they either focus on simple relationships involving a pair of attributes (e.g., correlations) or convert the continuous semantics of numerical attributes to a discrete semantics, which results in information loss. To capture more complex relationships involving the numerical attributes, we developed a new data-profiling primitive called conformance constraints, which can model linear arithmetic relationships involving multiple numerical attributes. We present CoCo, a system that allows interactive discovery and exploration of Conformance Constraints for understanding trends involving the numerical attributes of a dataset, with a particular focus on the application of data cleaning. 
Through a simple interface, CoCo enables the user to guide conformance constraint discovery according to their preferences. The user can examine to what extent a new, possibly dirty, dataset satisfies or violates the discovered conformance constraints. Further, CoCo provides useful suggestions for cleaning dirty data tuples, where the user can interactively alter cell values, and verify by checking the change in conformance constraint violation due to the alteration. We demonstrate how CoCo can help in understanding trends in the data and assist the users in interactive data cleaning, using conformance constraints. |
Anna Fariha, Ashish Tiwari, Arjun Radhakrishna, Sumit Gulwani, Alexandra Meliou Conformance Constraint Discovery: Measuring Trust in Data-Driven Systems Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2021. @inproceedings{FarihaTMRG21b, title = {Conformance Constraint Discovery: Measuring Trust in Data-Driven Systems}, author = {Anna Fariha and Ashish Tiwari and Arjun Radhakrishna and Sumit Gulwani and Alexandra Meliou}, url = {https://afariha.github.io/papers/Conformance_Constraints_SIGMOD_2021.pdf}, doi = {10.1145/3448016.3452795}, year = {2021}, date = {2021-06-18}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD)}, abstract = {The reliability of inferences made by data-driven systems hinges on the data’s continued conformance to the systems’ initial settings and assumptions. When serving data (on which we want to apply inference) deviates from the profile of the initial training data, the outcome of inference becomes unreliable. We introduce conformance constraints, a new data profiling primitive tailored towards quantifying the degree of non-conformance, which can effectively characterize if inference over that tuple is untrustworthy. Conformance constraints are constraints over certain arithmetic expressions (called projections) involving the numerical attributes of a dataset, which existing data profiling primitives such as functional dependencies and denial constraints cannot model. Our key finding is that projections that incur low variance on a dataset construct effective conformance constraints. This principle yields the surprising result that lowvariance components of a principal component analysis, which are usually discarded for dimensionality reduction, generate stronger conformance constraints than the high-variance components. Based on this result, we provide a highly scalable and efficient technique—linear in data size and cubic in the number of attributes—for discovering conformance constraints for a dataset. To measure the degree of a tuple’s non-conformance with respect to a dataset, we propose a quantitative semantics that captures how much a tuple violates the conformance constraints of that dataset. We demonstrate the value of conformance constraints on two applications: trusted machine learning and data drift. We empirically show that conformance constraints offer mechanisms to (1) reliably detect tuples on which the inference of a machine-learned model should not be trusted, and (2) quantify data drift more accurately than the state of the art.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The reliability of inferences made by data-driven systems hinges on the data’s continued conformance to the systems’ initial settings and assumptions. When serving data (on which we want to apply inference) deviates from the profile of the initial training data, the outcome of inference becomes unreliable. We introduce conformance constraints, a new data profiling primitive tailored towards quantifying the degree of non-conformance, which can effectively characterize if inference over that tuple is untrustworthy. Conformance constraints are constraints over certain arithmetic expressions (called projections) involving the numerical attributes of a dataset, which existing data profiling primitives such as functional dependencies and denial constraints cannot model. 
Our key finding is that projections that incur low variance on a dataset construct effective conformance constraints. This principle yields the surprising result that low-variance components of a principal component analysis, which are usually discarded for dimensionality reduction, generate stronger conformance constraints than the high-variance components. Based on this result, we provide a highly scalable and efficient technique—linear in data size and cubic in the number of attributes—for discovering conformance constraints for a dataset. To measure the degree of a tuple’s non-conformance with respect to a dataset, we propose a quantitative semantics that captures how much a tuple violates the conformance constraints of that dataset. We demonstrate the value of conformance constraints on two applications: trusted machine learning and data drift. We empirically show that conformance constraints offer mechanisms to (1) reliably detect tuples on which the inference of a machine-learned model should not be trusted, and (2) quantify data drift more accurately than the state of the art. |
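A minimal numpy sketch of the low-variance idea, under simplified assumptions: directions in which the training data barely varies act as approximate constraints, and a test tuple that drifts far along them gets a high non-conformance score. The scoring and scaling here are illustrative, not the paper's exact quantitative semantics.

```python
# Low-variance PCA directions as approximate conformance constraints (illustrative).
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(500, 1))
train = np.hstack([x, 2 * x + 0.01 * rng.normal(size=(500, 1))])  # near-linear relation y ~ 2x

mean = train.mean(axis=0)
eigvals, eigvecs = np.linalg.eigh(np.cov(train, rowvar=False))    # ascending eigenvalues

k = 1                                          # keep the lowest-variance direction(s)
low_var_dirs = eigvecs[:, :k]
scale = np.sqrt(eigvals[:k]) + 1e-12

def violation(tuple_):
    """Drift of the tuple along the low-variance directions, in training std units."""
    proj = (np.asarray(tuple_) - mean) @ low_var_dirs
    return float(np.max(np.abs(proj) / scale))

print(violation([1.0, 2.0]))   # conforms to y ~ 2x  -> small score
print(violation([1.0, 5.0]))   # breaks the relation -> large score
```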
Zafeiria Moumoulidou, Andrew McGregor, Alexandra Meliou Diverse Data Selection under Fairness Constraints Inproceedings International Conference on Database Theory, (ICDT), pp. 11:1–11:25, 2021. @inproceedings{MoumoulidouMM21, title = {Diverse Data Selection under Fairness Constraints}, author = {Zafeiria Moumoulidou and Andrew McGregor and Alexandra Meliou}, url = {https://arxiv.org/pdf/2010.09141.pdf}, year = {2021}, date = {2021-03-23}, booktitle = {International Conference on Database Theory, (ICDT)}, pages = {11:1--11:25}, abstract = {Diversity is an important principle in data selection and summarization, facility location, and recommendation systems. Our work focuses on maximizing diversity in data selection, while offering fairness guarantees. In particular, we offer the first study that augments the Max-Min diversification objective with fairness constraints. More specifically, given a universe U of n elements that can be partitioned into m disjoint groups, we aim to retrieve a k-sized subset that maximizes the pairwise minimum distance within the set (diversity) and contains a pre-specified ki number of elements from each group i (fairness). We show that this problem is NP-complete even in metric spaces, and we propose three novel algorithms, linear in n, that provide strong theoretical approximation guarantees for different values of m and k. Finally, we extend our algorithms and analysis to the case where groups can be overlapping.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Diversity is an important principle in data selection and summarization, facility location, and recommendation systems. Our work focuses on maximizing diversity in data selection, while offering fairness guarantees. In particular, we offer the first study that augments the Max-Min diversification objective with fairness constraints. More specifically, given a universe U of n elements that can be partitioned into m disjoint groups, we aim to retrieve a k-sized subset that maximizes the pairwise minimum distance within the set (diversity) and contains a pre-specified ki number of elements from each group i (fairness). We show that this problem is NP-complete even in metric spaces, and we propose three novel algorithms, linear in n, that provide strong theoretical approximation guarantees for different values of m and k. Finally, we extend our algorithms and analysis to the case where groups can be overlapping. |
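To make the objective concrete, here is a simple greedy heuristic: repeatedly add the farthest remaining point whose group still has quota. This only illustrates the fair max-min diversification setting; it is not one of the paper's approximation algorithms, and the points, groups, and quotas are made up.

```python
# Greedy farthest-point heuristic with per-group quotas (illustrative only).
import math

def fair_diverse(points, quotas):
    """points: {id: (group, (x, y))}; quotas: {group: required count}."""
    dist = lambda a, b: math.dist(points[a][1], points[b][1])
    remaining = dict(quotas)
    selected = [next(i for i, (g, _) in points.items() if remaining[g] > 0)]
    remaining[points[selected[0]][0]] -= 1
    while any(v > 0 for v in remaining.values()):
        candidates = [i for i, (g, _) in points.items()
                      if i not in selected and remaining[g] > 0]
        best = max(candidates, key=lambda i: min(dist(i, s) for s in selected))
        selected.append(best)
        remaining[points[best][0]] -= 1
    return selected

pts = {"a": ("A", (0, 0)), "b": ("A", (5, 0)), "c": ("A", (0.2, 0.1)),
       "d": ("B", (0, 5)), "e": ("B", (5, 5))}
print(fair_diverse(pts, {"A": 2, "B": 1}))   # -> ['a', 'e', 'b']
```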
Abhinav Jangda, Sandeep Polisetty, Arjun Guha, Marco Serafini Accelerating graph sampling for graph machine learning using GPUs Inproceedings EuroSys '21: Proceedings of the Sixteenth European Conference on Computer Systems, pp. 311–326, 2021. @inproceedings{Jangda2021accelerating, title = {Accelerating graph sampling for graph machine learning using GPUs}, author = {Abhinav Jangda and Sandeep Polisetty and Arjun Guha and Marco Serafini}, doi = {https://doi.org/10.1145/3447786.3456244}, year = {2021}, date = {2021-04-01}, booktitle = {EuroSys '21: Proceedings of the Sixteenth European Conference on Computer Systems}, pages = {311–326}, abstract = {Representation learning algorithms automatically learn the features of data. Several representation learning algorithms for graph data, such as DeepWalk, node2vec, and Graph-SAGE, sample the graph to produce mini-batches that are suitable for training a DNN. However, sampling time can be a significant fraction of training time, and existing systems do not efficiently parallelize sampling. Sampling is an "embarrassingly parallel" problem and may appear to lend itself to GPU acceleration, but the irregularity of graphs makes it hard to use GPU resources effectively. This paper presents NextDoor, a system designed to effectively perform graph sampling on GPUs. NextDoor employs a new approach to graph sampling that we call transit-parallelism, which allows load balancing and caching of edges. NextDoor provides end-users with a high-level abstraction for writing a variety of graph sampling algorithms. We implement several graph sampling applications, and show that NextDoor runs them orders of magnitude faster than existing systems.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Representation learning algorithms automatically learn the features of data. Several representation learning algorithms for graph data, such as DeepWalk, node2vec, and Graph-SAGE, sample the graph to produce mini-batches that are suitable for training a DNN. However, sampling time can be a significant fraction of training time, and existing systems do not efficiently parallelize sampling. Sampling is an "embarrassingly parallel" problem and may appear to lend itself to GPU acceleration, but the irregularity of graphs makes it hard to use GPU resources effectively. This paper presents NextDoor, a system designed to effectively perform graph sampling on GPUs. NextDoor employs a new approach to graph sampling that we call transit-parallelism, which allows load balancing and caching of edges. NextDoor provides end-users with a high-level abstraction for writing a variety of graph sampling algorithms. We implement several graph sampling applications, and show that NextDoor runs them orders of magnitude faster than existing systems. |
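Below is a CPU-side sketch of the kind of k-hop neighbor sampling such systems accelerate, shown purely to illustrate how sampled mini-batches are formed; the graph, seeds, and fanouts are hypothetical, and NextDoor's contribution is the GPU transit-parallel execution, which this sketch does not attempt.

```python
# Plain-Python sketch of k-hop neighbor sampling for a mini-batch of seed vertices.
import random

graph = {0: [1, 2, 3], 1: [0, 2], 2: [0, 1, 3, 4], 3: [0, 2], 4: [2]}

def sample_khop(seeds, fanouts, g=graph, rng=random.Random(0)):
    """Return, per hop, the sampled (src, dst) edges for a mini-batch of seeds."""
    frontier, layers = list(seeds), []
    for fanout in fanouts:
        edges = []
        for v in frontier:
            neighbors = g.get(v, [])
            picks = rng.choices(neighbors, k=fanout) if neighbors else []  # with replacement
            edges.extend((v, u) for u in picks)
        layers.append(edges)
        frontier = [u for _, u in edges]     # next hop expands from the sampled endpoints
    return layers

print(sample_khop(seeds=[0, 4], fanouts=[2, 2]))
```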
Yifei Yang, Matt Youill, Matthew Woicik, Yizhou Liu, Xiangyao Yu, Marco Serafini, Ashraf Aboulnaga, Michael Stonebraker FlexPushdownDB: Hybrid Pushdown and Caching in a Cloud DBMS Inproceedings Proceedings of the VLDB Endowment, pp. 2101–2113, 2021. @inproceedings{yang2021flexpushdowndb, title = {FlexPushdownDB: Hybrid Pushdown and Caching in a Cloud DBMS}, author = {Yifei Yang and Matt Youill and Matthew Woicik and Yizhou Liu and Xiangyao Yu and Marco Serafini and Ashraf Aboulnaga and Michael Stonebraker}, doi = {https://doi.org/10.14778/3476249.3476265}, year = {2021}, date = {2021-06-01}, booktitle = {Proceedings of the VLDB Endowment}, volume = {14}, number = {11}, pages = { 2101–2113}, abstract = {Modern cloud databases adopt a storage-disaggregation architecture that separates the management of computation and storage. A major bottleneck in such an architecture is the network connecting the computation and storage layers. Two solutions have been explored to mitigate the bottleneck: caching and computation pushdown. While both techniques can significantly reduce network traffic, existing DBMSs consider them as orthogonal techniques and support only one or the other, leaving potential performance benefits unexploited. In this paper we present FlexPushdownDB (FPDB), an OLAP cloud DBMS prototype that supports fine-grained hybrid query execution to combine the benefits of caching and computation pushdown in a storage-disaggregation architecture. We build a hybrid query executor based on a new concept called separable operators to combine the data from the cache and results from the pushdown processing. We also propose a novel Weighted-LFU cache replacement policy that takes into account the cost of pushdown computation. Our experimental evaluation on the Star Schema Benchmark shows that the hybrid execution outperforms both the conventional caching-only architecture and pushdown-only architecture by 2.2X. In the hybrid architecture, our experiments show that Weighted-LFU can outperform the baseline LFU by 37%}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Modern cloud databases adopt a storage-disaggregation architecture that separates the management of computation and storage. A major bottleneck in such an architecture is the network connecting the computation and storage layers. Two solutions have been explored to mitigate the bottleneck: caching and computation pushdown. While both techniques can significantly reduce network traffic, existing DBMSs consider them as orthogonal techniques and support only one or the other, leaving potential performance benefits unexploited. In this paper we present FlexPushdownDB (FPDB), an OLAP cloud DBMS prototype that supports fine-grained hybrid query execution to combine the benefits of caching and computation pushdown in a storage-disaggregation architecture. We build a hybrid query executor based on a new concept called separable operators to combine the data from the cache and results from the pushdown processing. We also propose a novel Weighted-LFU cache replacement policy that takes into account the cost of pushdown computation. Our experimental evaluation on the Star Schema Benchmark shows that the hybrid execution outperforms both the conventional caching-only architecture and pushdown-only architecture by 2.2X. In the hybrid architecture, our experiments show that Weighted-LFU can outperform the baseline LFU by 37% |
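A small sketch of a weighted-LFU eviction rule in the spirit of the one described above: the victim is the entry with the lowest product of access frequency and a weight standing in for the cost of redoing the work via pushdown. The cache structure and costs are illustrative assumptions, not FPDB's implementation.

```python
# Illustrative weighted-LFU cache: evict the entry with the lowest frequency x pushdown cost.
class WeightedLFUCache:
    def __init__(self, capacity):
        self.capacity = capacity
        self.entries = {}            # key -> (access frequency, pushdown cost)

    def access(self, key, pushdown_cost):
        freq, _ = self.entries.get(key, (0, pushdown_cost))
        if key not in self.entries and len(self.entries) >= self.capacity:
            victim = min(self.entries,
                         key=lambda k: self.entries[k][0] * self.entries[k][1])
            del self.entries[victim]
        self.entries[key] = (freq + 1, pushdown_cost)

cache = WeightedLFUCache(capacity=2)
for key, cost in [("seg_a", 1.0), ("seg_a", 1.0), ("seg_b", 10.0), ("seg_c", 1.0)]:
    cache.access(key, cost)
print(cache.entries)   # seg_b survives despite a single access: its pushdown cost is high
```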
Ryan McKenna, Siddhant Pradhan, Daniel Sheldon, Gerome Miklau Relaxed marginal consistency for differentially private query answering Inproceedings Advances in Neural Information Processing Systems, pp. 20696-20707, 2021. @inproceedings{McKenna2021relaxed, title = {Relaxed marginal consistency for differentially private query answering}, author = {Ryan McKenna and Siddhant Pradhan and Daniel Sheldon and Gerome Miklau}, url = {https://proceedings.neurips.cc/paper/2021/hash/acb55f9af76808c5fd5522dcdb519fde-Abstract.html}, year = {2021}, date = {2021-12-01}, booktitle = {Advances in Neural Information Processing Systems}, volume = {34}, pages = {20696-20707}, abstract = {Many differentially private algorithms for answering database queries involve a step that reconstructs a discrete data distribution from noisy measurements. This provides consistent query answers and reduces error, but often requires space that grows exponentially with dimension. PRIVATE-PGM is a recent approach that uses graphical models to represent the data distribution, with complexity proportional to that of exact marginal inference in a graphical model with structure determined by the co-occurrence of variables in the noisy measurements. PRIVATE-PGM is highly scalable for sparse measurements, but may fail to run in high dimensions with dense measurements. We overcome the main scalability limitation of PRIVATE-PGM through a principled approach that relaxes consistency constraints in the estimation objective. Our new approach works with many existing private query answering algorithms and improves scalability or accuracy with no privacy cost.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Many differentially private algorithms for answering database queries involve a step that reconstructs a discrete data distribution from noisy measurements. This provides consistent query answers and reduces error, but often requires space that grows exponentially with dimension. PRIVATE-PGM is a recent approach that uses graphical models to represent the data distribution, with complexity proportional to that of exact marginal inference in a graphical model with structure determined by the co-occurrence of variables in the noisy measurements. PRIVATE-PGM is highly scalable for sparse measurements, but may fail to run in high dimensions with dense measurements. We overcome the main scalability limitation of PRIVATE-PGM through a principled approach that relaxes consistency constraints in the estimation objective. Our new approach works with many existing private query answering algorithms and improves scalability or accuracy with no privacy cost. |
Ryan McKenna, Gerome Miklau, Daniel Sheldon Winning the NIST Contest: A scalable and general approach to differentially private synthetic data Journal Article Journal of Privacy and Confidentiality, 11 (3), 2021. @article{McKenna2021winning, title = {Winning the NIST Contest: A scalable and general approach to differentially private synthetic data}, author = {Ryan McKenna and Gerome Miklau and Daniel Sheldon}, doi = {https://doi.org/10.29012/jpc.778}, year = {2021}, date = {2021-12-01}, journal = {Journal of Privacy and Confidentiality}, volume = {11}, number = {3}, abstract = {We propose a general approach for differentially private synthetic data generation, that consists of three steps: (1) select a collection of low-dimensional marginals, (2) measure those marginals with a noise addition mechanism, and (3) generate synthetic data that preserves the measured marginals well. Central to this approach is Private-PGM, a post-processing method that is used to estimate a high-dimensional data distribution from noisy measurements of its marginals. We present two mechanisms, NIST-MST and MST, that are instances of this general approach. NIST-MST was the winning mechanism in the 2018 NIST differential privacy synthetic data competition, and MST is a new mechanism that can work in more general settings, while still performing comparably to NIST-MST. We believe our general approach should be of broad interest, and can be adopted in future mechanisms for synthetic data generation.}, keywords = {}, pubstate = {published}, tppubtype = {article} } We propose a general approach for differentially private synthetic data generation, that consists of three steps: (1) select a collection of low-dimensional marginals, (2) measure those marginals with a noise addition mechanism, and (3) generate synthetic data that preserves the measured marginals well. Central to this approach is Private-PGM, a post-processing method that is used to estimate a high-dimensional data distribution from noisy measurements of its marginals. We present two mechanisms, NIST-MST and MST, that are instances of this general approach. NIST-MST was the winning mechanism in the 2018 NIST differential privacy synthetic data competition, and MST is a new mechanism that can work in more general settings, while still performing comparably to NIST-MST. We believe our general approach should be of broad interest, and can be adopted in future mechanisms for synthetic data generation. |
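Steps (1) and (2) of the recipe can be sketched in a few lines: pick some low-dimensional marginals and measure each with a noise-addition mechanism (here Laplace noise with sensitivity 1 under add/remove of one record, and a naive split of the privacy budget). Step (3), estimating a distribution from the noisy marginals with Private-PGM and sampling synthetic rows, is omitted; the data and marginal choices are illustrative.

```python
# Steps (1)-(2) of the select-measure-generate recipe, at toy scale.
import numpy as np

rng = np.random.default_rng(1)
data = rng.integers(0, 2, size=(1000, 3))        # 1000 rows, 3 binary attributes
marginals = [(0, 1), (1, 2)]                     # step 1: chosen 2-way marginals

def measure_marginal(data, cols, epsilon):
    """Step 2: marginal histogram plus Laplace noise (count sensitivity 1)."""
    idx = data[:, cols[0]] * 2 + data[:, cols[1]]          # encode the 2-way cell index
    hist = np.bincount(idx, minlength=4).astype(float)
    return hist + rng.laplace(scale=1.0 / epsilon, size=hist.shape)

eps_per_marginal = 0.5                           # naive even split of the privacy budget
noisy = {cols: measure_marginal(data, cols, eps_per_marginal) for cols in marginals}
for cols, hist in noisy.items():                 # step 3 (generation) would consume these
    print(cols, np.round(hist, 1))
```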
Marco Serafini, Hui Guan Scalable graph neural network training: The case for sampling Journal Article ACM SIGOPS Operating Systems Review, 55 (1), pp. 68-76, 2021. @article{serafini2021scalable, title = {Scalable graph neural network training: The case for sampling}, author = {Marco Serafini and Hui Guan}, doi = {https://doi.org/10.1145/3469379.3469387}, year = {2021}, date = {2021-01-01}, journal = {ACM SIGOPS Operating Systems Review}, volume = {55}, number = {1}, pages = {68-76}, abstract = {Graph Neural Networks (GNNs) are a new and increasingly popular family of deep neural network architectures to perform learning on graphs. Training them efficiently is challenging due to the irregular nature of graph data. The problem becomes even more challenging when scaling to large graphs that exceed the capacity of single devices. Standard approaches to distributed DNN training, like data and model parallelism, do not directly apply to GNNs. Instead, two different approaches have emerged in the literature: whole-graph and sample-based training. In this paper, we review and compare the two approaches. Scalability is challenging with both approaches, but we make a case that research should focus on sample-based training since it is a more promising approach. Finally, we review recent systems supporting sample-based training.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Graph Neural Networks (GNNs) are a new and increasingly popular family of deep neural network architectures to perform learning on graphs. Training them efficiently is challenging due to the irregular nature of graph data. The problem becomes even more challenging when scaling to large graphs that exceed the capacity of single devices. Standard approaches to distributed DNN training, like data and model parallelism, do not directly apply to GNNs. Instead, two different approaches have emerged in the literature: whole-graph and sample-based training. In this paper, we review and compare the two approaches. Scalability is challenging with both approaches, but we make a case that research should focus on sample-based training since it is a more promising approach. Finally, we review recent systems supporting sample-based training. |
2020 |
Matteo Brucato, Nishant Yadav, Azza Abouzied, Peter J Haas, Alexandra Meliou Stochastic Package Queries in Probabilistic Databases Inproceedings Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020, pp. 269–283, 2020. @inproceedings{DBLP:conf/sigmod/BrucatoYAHM20, title = {Stochastic Package Queries in Probabilistic Databases}, author = {Matteo Brucato and Nishant Yadav and Azza Abouzied and Peter J Haas and Alexandra Meliou}, url = {https://doi.org/10.1145/3318464.3389765 https://people.cs.umass.edu/~matteo/files/3318464.3389765.pdf}, doi = {10.1145/3318464.3389765}, year = {2020}, date = {2020-06-15}, booktitle = {Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020}, pages = {269--283}, crossref = {DBLP:conf/sigmod/2020}, abstract = {We provide methods for in-database support of decision making under uncertainty. Many important decision problems correspond to selecting a "package" (bag of tuples in a relational database) that jointly satisfy a set of constraints while minimizing some overall "cost" function; in most real-world problems, the data is uncertain. We provide methods for specifying---via a SQL extension---and processing stochastic package queries (SPQS), in order to solve optimization problems over uncertain data, right where the data resides. Prior work in stochastic programming uses Monte Carlo methods where the original stochastic optimization problem is approximated by a large deterministic optimization problem that incorporates many "scenarios", i.e., sample realizations of the uncertain data values. For large database tables, however, a huge number of scenarios is required, leading to poor performance and, often, failure of the solver software. We therefore provide a novel ßs algorithm that, instead of trying to solve a large deterministic problem, seamlessly approximates it via a sequence of smaller problems defined over carefully crafted "summaries" of the scenarios that accelerate convergence to a feasible and near-optimal solution. Experimental results on our prototype system show that ßs can be orders of magnitude faster than prior methods at finding feasible and high-quality packages.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } We provide methods for in-database support of decision making under uncertainty. Many important decision problems correspond to selecting a "package" (bag of tuples in a relational database) that jointly satisfy a set of constraints while minimizing some overall "cost" function; in most real-world problems, the data is uncertain. We provide methods for specifying---via a SQL extension---and processing stochastic package queries (SPQS), in order to solve optimization problems over uncertain data, right where the data resides. Prior work in stochastic programming uses Monte Carlo methods where the original stochastic optimization problem is approximated by a large deterministic optimization problem that incorporates many "scenarios", i.e., sample realizations of the uncertain data values. For large database tables, however, a huge number of scenarios is required, leading to poor performance and, often, failure of the solver software. 
We therefore provide a novel SummarySearch algorithm that, instead of trying to solve a large deterministic problem, seamlessly approximates it via a sequence of smaller problems defined over carefully crafted "summaries" of the scenarios that accelerate convergence to a feasible and near-optimal solution. Experimental results on our prototype system show that SummarySearch can be orders of magnitude faster than prior methods at finding feasible and high-quality packages. |
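For contrast with the summary-based approach, here is the naive Monte Carlo (sample-average-approximation) baseline it improves on, at toy scale: sample scenarios for the uncertain attribute and brute-force the cheapest package whose probabilistic constraint holds in enough scenarios. The relation, distributions, and thresholds are hypothetical.

```python
# Naive scenario-based sketch of a stochastic package query (illustrative only).
from itertools import combinations
import random

rng = random.Random(0)
stocks = {"x": (10.0, 1.0, 0.2), "y": (12.0, 1.5, 0.3), "z": (8.0, 0.8, 0.2)}
# name -> (price, mean_gain, gain_stddev); the gain is the uncertain attribute

def feasible(package, n_scenarios=2000, threshold=2.0, prob=0.9):
    """Check P(total gain >= threshold) >= prob by sampling scenarios."""
    hits = 0
    for _ in range(n_scenarios):
        total = sum(rng.gauss(stocks[s][1], stocks[s][2]) for s in package)
        hits += total >= threshold
    return hits / n_scenarios >= prob

best = None
for k in (1, 2, 3):
    for package in combinations(stocks, k):
        price = sum(stocks[s][0] for s in package)
        if feasible(package) and (best is None or price < best[0]):
            best = (price, package)
print(best)   # cheapest package that clears the probabilistic constraint
```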
Dan Zhang, Ryan McKenna, Ios Kotsogiannis, George Bissias, Michael Hay, Ashwin Machanavajjhala, Gerome Miklau ϵKTELO: A Framework for Defining Differentially Private Computations Journal Article ACM Trans. Database Syst., 45 (1), pp. 2:1–2:44, 2020. @article{DBLP:journals/tods/ZhangMKBHMM20, title = {ϵKTELO: A Framework for Defining Differentially Private Computations}, author = {Dan Zhang and Ryan McKenna and Ios Kotsogiannis and George Bissias and Michael Hay and Ashwin Machanavajjhala and Gerome Miklau}, url = {https://doi.org/10.1145/3362032}, doi = {10.1145/3362032}, year = {2020}, date = {2020-01-01}, journal = {ACM Trans. Database Syst.}, volume = {45}, number = {1}, pages = {2:1--2:44}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
David Pujol, Ryan McKenna, Satya Kuppam, Michael Hay, Ashwin Machanavajjhala, Gerome Miklau Fair decision making using privacy-protected data Inproceedings FAT* '20: Conference on Fairness, Accountability, and Transparency, Barcelona, Spain, January 27-30, 2020, pp. 189–199, 2020. @inproceedings{DBLP:conf/fat/PujolMKHMM20, title = {Fair decision making using privacy-protected data}, author = {David Pujol and Ryan McKenna and Satya Kuppam and Michael Hay and Ashwin Machanavajjhala and Gerome Miklau}, url = {https://doi.org/10.1145/3351095.3372872}, doi = {10.1145/3351095.3372872}, year = {2020}, date = {2020-01-01}, booktitle = {FAT* '20: Conference on Fairness, Accountability, and Transparency, Barcelona, Spain, January 27-30, 2020}, pages = {189--199}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Anna Fariha, Suman Nath, Alexandra Meliou Causality-Guided Adaptive Interventional Debugging Inproceedings Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020, pp. 431–446, 2020. @inproceedings{DBLP:conf/sigmod/FarihaNM20, title = {Causality-Guided Adaptive Interventional Debugging}, author = {Anna Fariha and Suman Nath and Alexandra Meliou}, url = {https://doi.org/10.1145/3318464.3389694 https://people.cs.umass.edu/~afariha/papers/aid.pdf}, doi = {10.1145/3318464.3389694}, year = {2020}, date = {2020-06-15}, booktitle = {Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020}, pages = {431--446}, crossref = {DBLP:conf/sigmod/2020}, abstract = {Runtime nondeterminism is a fact of life in modern database applications. Previous research has shown that nondeterminism can cause applications to intermittently crash, become unresponsive, or experience data corruption. We propose Adaptive Interventional Debugging (AID) for debugging such intermittent failures. AID combines existing statistical debugging, causal analysis, fault injection, and group testing techniques in a novel way to (1) pinpoint the root cause of an application's intermittent failure and (2) generate an explanation of how the root cause triggers the failure. AID works by first identifying a set of runtime behaviors (called predicates) that are strongly correlated to the failure. It then utilizes temporal properties of the predicates to (over)-approximate their causal relationships. Finally, it uses fault injection to execute a sequence of interventions on the predicates and discover their true causal relationships. This enables AID to identify the true root cause and its causal relationship to the failure. We theoretically analyze how fast AID can converge to the identification. We evaluate AID with six real-world applications that intermittently fail under specific inputs. In each case, AID was able to identify the root cause and explain how the root cause triggered the failure, much faster than group testing and more precisely than statistical debugging. We also evaluate AID with many synthetically generated applications with known root causes and confirm that the benefits also hold for them.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Runtime nondeterminism is a fact of life in modern database applications. Previous research has shown that nondeterminism can cause applications to intermittently crash, become unresponsive, or experience data corruption. We propose Adaptive Interventional Debugging (AID) for debugging such intermittent failures. AID combines existing statistical debugging, causal analysis, fault injection, and group testing techniques in a novel way to (1) pinpoint the root cause of an application's intermittent failure and (2) generate an explanation of how the root cause triggers the failure. AID works by first identifying a set of runtime behaviors (called predicates) that are strongly correlated to the failure. It then utilizes temporal properties of the predicates to (over)-approximate their causal relationships. Finally, it uses fault injection to execute a sequence of interventions on the predicates and discover their true causal relationships. This enables AID to identify the true root cause and its causal relationship to the failure. 
We theoretically analyze how fast AID can converge to the identification. We evaluate AID with six real-world applications that intermittently fail under specific inputs. In each case, AID was able to identify the root cause and explain how the root cause triggered the failure, much faster than group testing and more precisely than statistical debugging. We also evaluate AID with many synthetically generated applications with known root causes and confirm that the benefits also hold for them. |
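A toy sketch of the two-phase idea: rank runtime predicates by how strongly they co-occur with failing runs, then intervene on the top suspect and check whether the failure disappears. The runs, predicates, scoring, and intervention hook below are all hypothetical stand-ins for AID's statistical debugging, fault injection, and group testing machinery.

```python
# Illustrative two-phase debugging loop: statistical ranking, then intervention.
def rank_predicates(runs):
    """runs: list of (observed_predicates, failed). Score each predicate by P(fail | predicate)."""
    scores = {}
    preds = {p for obs, _ in runs for p in obs}
    for p in preds:
        with_p = [failed for obs, failed in runs if p in obs]
        scores[p] = sum(with_p) / len(with_p)
    return sorted(scores.items(), key=lambda kv: -kv[1])

runs = [({"slow_io", "lock_timeout"}, True), ({"slow_io"}, False),
        ({"lock_timeout"}, True), ({"cache_miss"}, False)]
ranked = rank_predicates(runs)
print(ranked)                          # lock_timeout is the strongest suspect

def intervene(predicate, rerun):
    """Confirm causality: suppress the predicate (e.g., via fault injection) and
    re-execute; if the failure no longer occurs, report the predicate as a root cause."""
    return not rerun(disabled=predicate)

print(intervene(ranked[0][0], rerun=lambda disabled: disabled != "lock_timeout"))
```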
Anna Fariha, Ashish Tiwari, Arjun Radhakrishna, Sumit Gulwani ExTuNe: Explaining Tuple Non-conformance Inproceedings Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020, pp. 2741–2744, 2020. @inproceedings{DBLP:conf/sigmod/Fariha0RG20, title = {ExTuNe: Explaining Tuple Non-conformance}, author = {Anna Fariha and Ashish Tiwari and Arjun Radhakrishna and Sumit Gulwani}, url = {https://doi.org/10.1145/3318464.3384694 https://people.cs.umass.edu/~afariha/papers/ExTuNe.pdf}, doi = {10.1145/3318464.3384694}, year = {2020}, date = {2020-06-16}, booktitle = {Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020}, pages = {2741--2744}, crossref = {DBLP:conf/sigmod/2020}, abstract = {In data-driven systems, we often encounter tuples on which the predictions of a machine-learned model are untrustworthy. A key cause of such untrustworthiness is non-conformance of a new tuple with respect to the training dataset. To check conformance, we introduce a novel concept of data invariant, which captures a set of implicit constraints that all tuples of a dataset satisfy: a test tuple is non-conforming if it violates the data invariants. Data invariants model complex relationships among multiple attributes; but do not provide interpretable explanations of non-conformance. We present ExTuNe, a system for Explaining causes of Tuple Non-conformance. Based on the principles of causality, ExTuNe assigns responsibility to the attributes for causing non-conformance. The key idea is to observe change in invariant violation under intervention on attribute-values. Through a simple interface, ExTuNe produces a ranked list of the test tuples based on their degree of non-conformance and visualizes tuple-level attribute responsibility for non-conformance through heat maps. ExTuNe further visualizes attribute responsibility, aggregated over the test tuples. We demonstrate how ExTuNe can detect and explain tuple non-conformance and assist the users to make careful decisions towards achieving trusted machine learning.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } In data-driven systems, we often encounter tuples on which the predictions of a machine-learned model are untrustworthy. A key cause of such untrustworthiness is non-conformance of a new tuple with respect to the training dataset. To check conformance, we introduce a novel concept of data invariant, which captures a set of implicit constraints that all tuples of a dataset satisfy: a test tuple is non-conforming if it violates the data invariants. Data invariants model complex relationships among multiple attributes; but do not provide interpretable explanations of non-conformance. We present ExTuNe, a system for Explaining causes of Tuple Non-conformance. Based on the principles of causality, ExTuNe assigns responsibility to the attributes for causing non-conformance. The key idea is to observe change in invariant violation under intervention on attribute-values. Through a simple interface, ExTuNe produces a ranked list of the test tuples based on their degree of non-conformance and visualizes tuple-level attribute responsibility for non-conformance through heat maps. ExTuNe further visualizes attribute responsibility, aggregated over the test tuples. 
We demonstrate how ExTuNe can detect and explain tuple non-conformance and assist users in making careful decisions towards achieving trusted machine learning. |
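The intervention idea can be sketched in a few lines: replace one attribute of a non-conforming tuple with a typical training value and assign responsibility according to how much the non-conformance score drops. The score, data, and normalization below are illustrative assumptions, not ExTuNe's exact semantics.

```python
# Toy sketch of intervention-based attribute responsibility for non-conformance.
import numpy as np

train = np.array([[1.0, 2.0], [2.0, 4.1], [3.0, 5.9], [4.0, 8.2]])  # training trend: y ~ 2x
center = train.mean(axis=0)

def nonconformance(t):
    """Illustrative score: deviation from the y = 2x trend seen in training."""
    return abs(t[1] - 2.0 * t[0])

def responsibility(tuple_):
    base = nonconformance(tuple_)
    shares = {}
    for j in range(len(tuple_)):
        intervened = np.array(tuple_, dtype=float)
        intervened[j] = center[j]                   # intervention on attribute j
        shares[j] = max(base - nonconformance(intervened), 0.0)
    total = sum(shares.values()) or 1.0
    return {j: round(v / total, 2) for j, v in shares.items()}

print(responsibility([1.0, 20.0]))   # attribute 1 (the y value) is mostly responsible
```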
Cibele Freire, Wolfgang Gatterbauer, Neil Immerman, Alexandra Meliou New Results for the Complexity of Resilience for Binary Conjunctive Queries with Self-Joins Inproceedings Proceedings of the 39th ACM SIGMOD-SIGACT-SIGAI Symposium on Principles of Database Systems, PODS 2020, Portland, OR, USA, June 14-19, 2020, pp. 271–284, 2020. @inproceedings{DBLP:conf/pods/FreireGIM20, title = {New Results for the Complexity of Resilience for Binary Conjunctive Queries with Self-Joins}, author = {Cibele Freire and Wolfgang Gatterbauer and Neil Immerman and Alexandra Meliou}, url = {https://doi.org/10.1145/3375395.3387647 https://people.cs.umass.edu/~ameli/projects/causality/papers/pods2020-Resilience.pdf}, doi = {10.1145/3375395.3387647}, year = {2020}, date = {2020-06-16}, booktitle = {Proceedings of the 39th ACM SIGMOD-SIGACT-SIGAI Symposium on Principles of Database Systems, PODS 2020, Portland, OR, USA, June 14-19, 2020}, pages = {271--284}, crossref = {DBLP:conf/pods/2020}, abstract = {The resilience of a Boolean query on a database is the minimum number of tuples that need to be deleted from the input tables in order to make the query false. A solution to this problem immediately translates into a solution for the more widely known problem of deletion propagation with source-side effects. In this paper, we give several novel results on the hardness of the resilience problem for conjunctive queries with self-joins, and, more specifically, we present a dichotomy result for the class of single-self-join binary queries with exactly two repeated relations occurring in the query. Unlike in the self-join free case, the concept of triad is not enough to fully characterize the complexity of resilience. We identify new structural properties, namely chains, confluences and permutations, which lead to various NP-hardness results. We also give novel involved reductions to network flow to show certain cases are in P. Although restricted, our results provide important insights into the problem of self-joins that we hope can help solve the general case of all conjunctive queries with self-joins in the future.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The resilience of a Boolean query on a database is the minimum number of tuples that need to be deleted from the input tables in order to make the query false. A solution to this problem immediately translates into a solution for the more widely known problem of deletion propagation with source-side effects. In this paper, we give several novel results on the hardness of the resilience problem for conjunctive queries with self-joins, and, more specifically, we present a dichotomy result for the class of single-self-join binary queries with exactly two repeated relations occurring in the query. Unlike in the self-join free case, the concept of triad is not enough to fully characterize the complexity of resilience. We identify new structural properties, namely chains, confluences and permutations, which lead to various NP-hardness results. We also give novel involved reductions to network flow to show certain cases are in P. Although restricted, our results provide important insights into the problem of self-joins that we hope can help solve the general case of all conjunctive queries with self-joins in the future. |
Xiangyao Yu, Matt Youill, Matthew E Woicik, Abdurrahman Ghanem, Marco Serafini, Ashraf Aboulnaga, Michael Stonebraker PushdownDB: Accelerating a DBMS Using S3 Computation Inproceedings 36th IEEE International Conference on Data Engineering, ICDE 2020, Dallas, TX, USA, April 20-24, 2020, pp. 1802–1805, 2020. @inproceedings{DBLP:conf/icde/YuYWGSAS20, title = {PushdownDB: Accelerating a DBMS Using S3 Computation}, author = {Xiangyao Yu and Matt Youill and Matthew E Woicik and Abdurrahman Ghanem and Marco Serafini and Ashraf Aboulnaga and Michael Stonebraker}, url = {https://doi.org/10.1109/ICDE48307.2020.00174 https://marcoserafini.github.io/papers/pushdown.pdf}, doi = {10.1109/ICDE48307.2020.00174}, year = {2020}, date = {2020-01-01}, booktitle = {36th IEEE International Conference on Data Engineering, ICDE 2020, Dallas, TX, USA, April 20-24, 2020}, pages = {1802--1805}, crossref = {DBLP:conf/icde/2020}, abstract = {This paper studies the effectiveness of pushing parts of DBMS analytics queries into the Simple Storage Service (S3) of Amazon Web Services (AWS), using a recently released capability called S3 Select. We show that some DBMS primitives (filter, projection, and aggregation) can always be cost-effectively moved into S3. Other more complex operations (join, top-K, and groupby) require reimplementation to take advantage of S3 Select and are often candidates for pushdown. We demonstrate these capabilities through experimentation using a new DBMS that we developed, PushdownDB. Experimentation with a collection of queries including TPC-H queries shows that PushdownDB is on average 30% cheaper and 6.7× faster than a baseline that does not use S3 Select.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } This paper studies the effectiveness of pushing parts of DBMS analytics queries into the Simple Storage Service (S3) of Amazon Web Services (AWS), using a recently released capability called S3 Select. We show that some DBMS primitives (filter, projection, and aggregation) can always be cost-effectively moved into S3. Other more complex operations (join, top-K, and groupby) require reimplementation to take advantage of S3 Select and are often candidates for pushdown. We demonstrate these capabilities through experimentation using a new DBMS that we developed, PushdownDB. Experimentation with a collection of queries including TPC-H queries shows that PushdownDB is on average 30% cheaper and 6.7× faster than a baseline that does not use S3 Select. |
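The pushdown idea described above is easy to prototype independently of PushdownDB itself. Below is a minimal, hypothetical sketch (not the paper's system) that pushes a filter and projection into S3 via the S3 Select API using boto3; the bucket name, object key, and column names are invented for illustration only.

```python
# Minimal illustration of S3-side filter/projection pushdown with S3 Select
# (not PushdownDB's code). Bucket, key, and column names are hypothetical.
import boto3

s3 = boto3.client("s3")

resp = s3.select_object_content(
    Bucket="example-bucket",              # hypothetical bucket
    Key="lineitem.csv",                   # hypothetical CSV object
    ExpressionType="SQL",
    Expression="SELECT s.l_orderkey, s.l_extendedprice "
               "FROM S3Object s WHERE CAST(s.l_quantity AS INT) > 30",
    InputSerialization={"CSV": {"FileHeaderInfo": "USE"}},
    OutputSerialization={"CSV": {}},
)

# The response is an event stream; only matching rows and projected columns
# cross the network, which is the point of the pushdown.
for event in resp["Payload"]:
    if "Records" in event:
        print(event["Records"]["Payload"].decode("utf-8"), end="")
```

Because only qualifying rows and the selected columns leave S3, this kind of filter and projection pushdown reduces network transfer, which is the effect the abstract reports as always being cost-effective.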
Xiaowei Zhu, Marco Serafini, Xiaosong Ma, Ashraf Aboulnaga, Wenguang Chen, Guanyu Feng LiveGraph: A Transactional Graph Storage System with Purely Sequential Adjacency List Scans Journal Article Proc. VLDB Endow., 13 (7), pp. 1020–1034, 2020. @article{DBLP:journals/pvldb/ZhuSMACF20, title = {LiveGraph: A Transactional Graph Storage System with Purely Sequential Adjacency List Scans}, author = {Xiaowei Zhu and Marco Serafini and Xiaosong Ma and Ashraf Aboulnaga and Wenguang Chen and Guanyu Feng}, url = {http://www.vldb.org/pvldb/vol13/p1020-zhu.pdf}, year = {2020}, date = {2020-01-01}, journal = {Proc. VLDB Endow.}, volume = {13}, number = {7}, pages = {1020--1034}, abstract = {The specific characteristics of graph workloads make it hard to design a one-size-fits-all graph storage system. Systems that support transactional updates use data structures with poor data locality, which limits the efficiency of analytical workloads or even simple edge scans. Other systems run graph analytics workloads efficiently, but cannot properly support transactions. This paper presents LiveGraph, a graph storage system that outperforms both the best graph transactional systems and the best solutions for real-time graph analytics on fresh data. LiveGraph achieves this by ensuring that adjacency list scans, a key operation in graph workloads, are purely sequential: they never require random accesses even in presence of concurrent transactions. Such pure-sequential operations are enabled by combining a novel graph-aware data structure, the Transactional Edge Log (TEL), with a concurrency control mechanism that leverages TEL’s data layout. Our evaluation shows that LiveGraph significantly outperforms state-of-the-art (graph) database solutions on both transactional and real-time analytical workloads.}, keywords = {}, pubstate = {published}, tppubtype = {article} } The specific characteristics of graph workloads make it hard to design a one-size-fits-all graph storage system. Systems that support transactional updates use data structures with poor data locality, which limits the efficiency of analytical workloads or even simple edge scans. Other systems run graph analytics workloads efficiently, but cannot properly support transactions. This paper presents LiveGraph, a graph storage system that outperforms both the best graph transactional systems and the best solutions for real-time graph analytics on fresh data. LiveGraph achieves this by ensuring that adjacency list scans, a key operation in graph workloads, are purely sequential: they never require random accesses even in presence of concurrent transactions. Such pure-sequential operations are enabled by combining a novel graph-aware data structure, the Transactional Edge Log (TEL), with a concurrency control mechanism that leverages TEL’s data layout. Our evaluation shows that LiveGraph significantly outperforms state-of-the-art (graph) database solutions on both transactional and real-time analytical workloads. |
Muhammad Bilal, Marco Serafini, Marco Canini, Rodrigo Rodrigues Do the Best Cloud Configurations Grow on Trees? An Experimental Evaluation of Black Box Algorithms for Optimizing Cloud Workloads Journal Article Proc. VLDB Endow., 13 (11), pp. 2563–2575, 2020. @article{DBLP:journals/pvldb/0007SCR20, title = {Do the Best Cloud Configurations Grow on Trees? An Experimental Evaluation of Black Box Algorithms for Optimizing Cloud Workloads}, author = {Muhammad Bilal and Marco Serafini and Marco Canini and Rodrigo Rodrigues}, url = {http://www.vldb.org/pvldb/vol13/p2563-bilal.pdf}, year = {2020}, date = {2020-01-01}, journal = {Proc. VLDB Endow.}, volume = {13}, number = {11}, pages = {2563--2575}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Matteo Brucato, Miro Mannino, Azza Abouzied, Peter J Haas, Alexandra Meliou sPaQLTooLs: A Stochastic Package Query Interface for Scalable Constrained Optimization Journal Article Proc. VLDB Endow., 13 (12), pp. 2881–2884, 2020. @article{DBLP:journals/pvldb/BrucatoMAHM20, title = {sPaQLTooLs: A Stochastic Package Query Interface for Scalable Constrained Optimization}, author = {Matteo Brucato and Miro Mannino and Azza Abouzied and Peter J Haas and Alexandra Meliou}, url = {http://www.vldb.org/pvldb/vol13/p2881-brucato.pdf}, year = {2020}, date = {2020-01-01}, journal = {Proc. VLDB Endow.}, volume = {13}, number = {12}, pages = {2881--2884}, abstract = {Everyone needs to make decisions under uncertainty and with limited resources, e.g., an investor who is building a stock portfolio subject to an investment budget and a bounded risk tolerance. Doing this with current technology is hard. There is a disconnect between software tools for data management, stochastic predictive modeling (e.g., simulation of future stock prices), and optimization; this leads to cumbersome analytical workflows. Moreover, current methods do not scale. To handle a broad class of uncertainty models, analysts approximate the original stochastic optimization problem by a large deterministic optimization problem that incorporates many “scenarios”, i.e., sample realizations of the uncertain data values. For large problems, a huge number of scenarios is required, often causing the solver to fail. We demonstrate sPaQLTooLs, a system for in-database specification and scalable solution of constrained optimization problems. The key ingredients are (i) a database-oriented specification of constrained stochastic optimization problems as “stochastic package queries” (SPQs), (ii) use of a Monte Carlo database to incorporate stochastic predictive models, and (iii) a new SUMMARYSEARCH algorithm for scalably solving SPQs with approximation guarantees. In this demonstration, the attendees will experience first-hand the difficulty of manually constructing feasible and high-quality portfolios, using real-world stock market data. We will then demonstrate how SUMMARYSEARCH can easily and efficiently help them find very good portfolios, while being orders of magnitude faster than prior methods.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Everyone needs to make decisions under uncertainty and with limited resources, e.g., an investor who is building a stock portfolio subject to an investment budget and a bounded risk tolerance. Doing this with current technology is hard. There is a disconnect between software tools for data management, stochastic predictive modeling (e.g., simulation of future stock prices), and optimization; this leads to cumbersome analytical workflows. Moreover, current methods do not scale. To handle a broad class of uncertainty models, analysts approximate the original stochastic optimization problem by a large deterministic optimization problem that incorporates many “scenarios”, i.e., sample realizations of the uncertain data values. For large problems, a huge number of scenarios is required, often causing the solver to fail. We demonstrate sPaQLTooLs, a system for in-database specification and scalable solution of constrained optimization problems. 
The key ingredients are (i) a database-oriented specification of constrained stochastic optimization problems as “stochastic package queries” (SPQs), (ii) use of a Monte Carlo database to incorporate stochastic predictive models, and (iii) a new SUMMARYSEARCH algorithm for scalably solving SPQs with approximation guarantees. In this demonstration, the attendees will experience first-hand the difficulty of manually constructing feasible and high-quality portfolios, using real-world stock market data. We will then demonstrate how SUMMARYSEARCH can easily and efficiently help them find very good portfolios, while being orders of magnitude faster than prior methods. |
Dan Zhang, Yoshihiko Suhara, Jinfeng Li, Madelon Hulsebos, Çağatay Demiralp, Wang-Chiew Tan Sato: Contextual Semantic Type Detection in Tables Journal Article Proc. VLDB Endow., 13 (11), pp. 1835–1848, 2020. @article{DBLP:journals/pvldb/ZhangSLHDT20, title = {Sato: Contextual Semantic Type Detection in Tables}, author = {Dan Zhang and Yoshihiko Suhara and Jinfeng Li and Madelon Hulsebos and Çağatay Demiralp and Wang-Chiew Tan}, url = {http://www.vldb.org/pvldb/vol13/p1835-zhang.pdf}, year = {2020}, date = {2020-01-01}, journal = {Proc. VLDB Endow.}, volume = {13}, number = {11}, pages = {1835--1848}, abstract = {Detecting the semantic types of data columns in relational tables is important for various data preparation and information retrieval tasks such as data cleaning, schema matching, data discovery, and semantic search. However, existing detection approaches either perform poorly with dirty data, support only a limited number of semantic types, fail to incorporate the table context of columns, or rely on large sample sizes for training data. We introduce Sato, a hybrid machine learning model to automatically detect the semantic types of columns in tables, exploiting the signals from the table context as well as the column values. Sato combines a deep learning model trained on a large-scale table corpus with topic modeling and structured prediction to achieve support-weighted and macro average F1 scores of 0.925 and 0.735, respectively, exceeding the state-of-the-art performance by a significant margin. We extensively analyze the overall and per-type performance of Sato, discussing how individual modeling components, as well as feature categories, contribute to its performance.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Detecting the semantic types of data columns in relational tables is important for various data preparation and information retrieval tasks such as data cleaning, schema matching, data discovery, and semantic search. However, existing detection approaches either perform poorly with dirty data, support only a limited number of semantic types, fail to incorporate the table context of columns, or rely on large sample sizes for training data. We introduce Sato, a hybrid machine learning model to automatically detect the semantic types of columns in tables, exploiting the signals from the table context as well as the column values. Sato combines a deep learning model trained on a large-scale table corpus with topic modeling and structured prediction to achieve support-weighted and macro average F1 scores of 0.925 and 0.735, respectively, exceeding the state-of-the-art performance by a significant margin. We extensively analyze the overall and per-type performance of Sato, discussing how individual modeling components, as well as feature categories, contribute to its performance. |
Anna Fariha, Matteo Brucato, Peter J Haas, Alexandra Meliou SuDocu: Summarizing Documents by Example Journal Article Proc. VLDB Endow., 13 (12), pp. 2861–2864, 2020. @article{DBLP:journals/pvldb/FarihaBHM20, title = {SuDocu: Summarizing Documents by Example}, author = {Anna Fariha and Matteo Brucato and Peter J Haas and Alexandra Meliou}, url = {http://www.vldb.org/pvldb/vol13/p2861-fariha.pdf}, year = {2020}, date = {2020-01-01}, journal = {Proc. VLDB Endow.}, volume = {13}, number = {12}, pages = {2861--2864}, abstract = {Text document summarization refers to the task of producing a brief representation of a document for easy human consumption. Existing text summarization techniques mostly focus on generic summarization, but users often require personalized summarization that targets their specific preferences and needs. However, precisely expressing preferences is challenging, and current methods are often ambiguous, outside the user’s control, or require costly training data. We propose a novel and effective way to express summarization intent (preferences) via examples: the user provides a few example summaries for a small number of documents in a collection, and the system summarizes the rest. We demonstrate SUDOCU, an example-based personalized DOCUment SUmmarization system. Through a simple interface, SUDOCU allows the users to provide example summaries, learns the summarization intent from the examples, and produces summaries for new documents that reflect the user’s summarization intent. SUDOCU further explains the captured summarization intent in the form of a package query, an extension of a traditional SQL query that handles complex constraints and preferences over answer sets. SUDOCU combines topic modeling, semantic similarity discovery, and in-database optimization in a novel way to achieve example-driven document summarization. We demonstrate how SUDOCU can detect complex summarization intents from a few example summaries and produce accurate summaries for new documents effectively and efficiently.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Text document summarization refers to the task of producing a brief representation of a document for easy human consumption. Existing text summarization techniques mostly focus on generic summarization, but users often require personalized summarization that targets their specific preferences and needs. However, precisely expressing preferences is challenging, and current methods are often ambiguous, outside the user’s control, or require costly training data. We propose a novel and effective way to express summarization intent (preferences) via examples: the user provides a few example summaries for a small number of documents in a collection, and the system summarizes the rest. We demonstrate SUDOCU, an example-based personalized DOCUment SUmmarization system. Through a simple interface, SUDOCU allows the users to provide example summaries, learns the summarization intent from the examples, and produces summaries for new documents that reflect the user’s summarization intent. SUDOCU further explains the captured summarization intent in the form of a package query, an extension of a traditional SQL query that handles complex constraints and preferences over answer sets. SUDOCU combines topic modeling, semantic similarity discovery, and in-database optimization in a novel way to achieve example-driven document summarization. 
We demonstrate how SUDOCU can detect complex summarization intents from a few example summaries and produce accurate summaries for new documents effectively and efficiently. |
2019 |
Ahmed Elgohary, Matthias Boehm, Peter J Haas, Frederick R Reiss, Berthold Reinwald Compressed linear algebra for declarative large-scale machine learning Journal Article Commun. ACM, 62 (5), pp. 83–91, 2019. @article{DBLP:journals/cacm/ElgoharyBHRR19, title = {Compressed linear algebra for declarative large-scale machine learning}, author = {Ahmed Elgohary and Matthias Boehm and Peter J Haas and Frederick R Reiss and Berthold Reinwald}, url = {https://doi.org/10.1145/3318221}, doi = {10.1145/3318221}, year = {2019}, date = {2019-01-01}, journal = {Commun. ACM}, volume = {62}, number = {5}, pages = {83--91}, abstract = {Large-scale Machine Learning (ML) algorithms are often iterative, using repeated read-only data access and I/O-bound matrix-vector multiplications. Hence, it is crucial for performance to fit the data into single-node or distributed main memory to enable fast matrix-vector operations. General-purpose compression struggles to achieve both good compression ratios and fast decompression for blockwise uncompressed operations. Therefore, we introduce Compressed Linear Algebra (CLA) for lossless matrix compression. CLA encodes matrices with lightweight, value-based compression techniques and executes linear algebra operations directly on the compressed representations. We contribute effective column compression schemes, cache-conscious operations, and an efficient sampling-based compression algorithm. Our experiments show good compression ratios and operations performance close to the uncompressed case, which enables fitting larger datasets into available memory. We thereby obtain significant end-to-end performance improvements.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Large-scale Machine Learning (ML) algorithms are often iterative, using repeated read-only data access and I/O-bound matrix-vector multiplications. Hence, it is crucial for performance to fit the data into single-node or distributed main memory to enable fast matrix-vector operations. General-purpose compression struggles to achieve both good compression ratios and fast decompression for blockwise uncompressed operations. Therefore, we introduce Compressed Linear Algebra (CLA) for lossless matrix compression. CLA encodes matrices with lightweight, value-based compression techniques and executes linear algebra operations directly on the compressed representations. We contribute effective column compression schemes, cache-conscious operations, and an efficient sampling-based compression algorithm. Our experiments show good compression ratios and operations performance close to the uncompressed case, which enables fitting larger datasets into available memory. We thereby obtain significant end-to-end performance improvements. |
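To make the "operate directly on compressed data" idea concrete, here is a toy Python sketch. It is not the CLA encodings or the implementation described in the paper; it simply run-length encodes each column and computes a matrix-vector product over the compressed columns without decompressing them.

```python
# Toy illustration of value-based column compression: run-length encode each
# column and compute X @ v directly on the compressed columns. This is a
# simplification for intuition only, not the paper's CLA encodings.
from itertools import groupby

def rle_encode_column(col):
    """Encode a column as (value, run_length) pairs over consecutive equal values."""
    return [(v, len(list(g))) for v, g in groupby(col)]

def compressed_matvec(rle_columns, v, n_rows):
    """Compute X @ v, where column j of X is stored as rle_columns[j]."""
    result = [0.0] * n_rows
    for j, runs in enumerate(rle_columns):
        row = 0
        for value, length in runs:
            if value != 0 and v[j] != 0:          # zero runs contribute nothing
                contrib = value * v[j]
                for i in range(row, row + length):
                    result[i] += contrib
            row += length
    return result

# Columns with long runs of repeated values compress well and multiply fast.
X_cols = [[1, 1, 1, 0, 0, 0], [2, 2, 5, 5, 5, 5]]
rle = [rle_encode_column(c) for c in X_cols]
print(compressed_matvec(rle, v=[3.0, 1.0], n_rows=6))
# X @ v for X = [[1,2],[1,2],[1,5],[0,5],[0,5],[0,5]] -> [5.0, 5.0, 8.0, 5.0, 5.0, 5.0]
```

The point of the sketch is that a run touches each distinct value once rather than once per row, which is why lightweight value-based encodings can keep operation performance close to the uncompressed case while shrinking the data.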
Yanlei Diao, Pawel Guzewicz, Ioana Manolescu, Mirjana Mazuran Spade: A Modular Framework for Analytical Exploration of RDF Graphs Journal Article PVLDB, 12 (12), pp. 1926–1929, 2019. @article{DBLP:journals/pvldb/DiaoGMM19, title = {Spade: A Modular Framework for Analytical Exploration of RDF Graphs}, author = {Yanlei Diao and Pawel Guzewicz and Ioana Manolescu and Mirjana Mazuran}, url = {http://www.vldb.org/pvldb/vol12/p1926-diao.pdf https://hal.inria.fr/hal-02152844/document}, doi = {10.14778/3352063.3352101}, year = {2019}, date = {2019-01-01}, journal = {PVLDB}, volume = {12}, number = {12}, pages = {1926--1929}, abstract = {RDF data is complex; exploring it is hard, and can be done through many different metaphors. We have developed and propose to demonstrate Spade, a tool helping users discover meaningful content of an RDF graph by showing them the results of aggregation (OLAP-style) queries automatically identified from the data. Spade chooses aggregates that are visually interesting, a property formally based on statistic properties of the aggregation query results. While well understood for relational data, such exploration raises multiple challenges for RDF: facts, dimensions and measures have to be identified (as opposed to known beforehand); as there are more candidate aggregates, assessing their interestingness can be very costly; finally, ontologies bring novel specific challenges but also novel opportunities, enabling ontology-driven exploration from an aggregate initially proposed by the system. Spade is a generic, extensible framework, which we instantiated with: (i) novel methods for enumerating candidate measures and dimensions in the vast space of possibilities provided by an RDF graph; (ii) a set of aggregate interestingness functions; (iii) ontology-based interactive exploration, and (iv) efficient early-stop techniques for estimating the interestingness of an aggregate query. The demonstration will comprise interactive scenarios on a variety of large, interesting RDF graphs.}, keywords = {}, pubstate = {published}, tppubtype = {article} } RDF data is complex; exploring it is hard, and can be done through many different metaphors. We have developed and propose to demonstrate Spade, a tool helping users discover meaningful content of an RDF graph by showing them the results of aggregation (OLAP-style) queries automatically identified from the data. Spade chooses aggregates that are visually interesting, a property formally based on statistic properties of the aggregation query results. While well understood for relational data, such exploration raises multiple challenges for RDF: facts, dimensions and measures have to be identified (as opposed to known beforehand); as there are more candidate aggregates, assessing their interestingness can be very costly; finally, ontologies bring novel specific challenges but also novel opportunities, enabling ontology-driven exploration from an aggregate initially proposed by the system. Spade is a generic, extensible framework, which we instantiated with: (i) novel methods for enumerating candidate measures and dimensions in the vast space of possibilities provided by an RDF graph; (ii) a set of aggregate interestingness functions; (iii) ontology-based interactive exploration, and (iv) efficient early-stop techniques for estimating the interestingness of an aggregate query. The demonstration will comprise interactive scenarios on a variety of large, interesting RDF graphs. |
Khaled Zaouk, Fei Song, Chenghao Lyu, Arnab Sinha, Yanlei Diao, Prashant J Shenoy UDAO: A Next-Generation Unified Data Analytics Optimizer Journal Article PVLDB, 12 (12), pp. 1934–1937, 2019. @article{DBLP:journals/pvldb/ZaoukSLSDS19, title = {UDAO: A Next-Generation Unified Data Analytics Optimizer}, author = {Khaled Zaouk and Fei Song and Chenghao Lyu and Arnab Sinha and Yanlei Diao and Prashant J Shenoy}, url = {http://www.vldb.org/pvldb/vol12/p1934-zaouk.pdf}, doi = {10.14778/3352063.3352103}, year = {2019}, date = {2019-01-01}, journal = {PVLDB}, volume = {12}, number = {12}, pages = {1934--1937}, abstract = {Big data analytics systems today still lack the ability to take user performance goals and budgetary constraints, collectively referred to as “objectives”, and automatically configure an analytic job to achieve the objectives. This paper presents UDAO, a unified data analytics optimizer that can automatically determine the parameters of the runtime system, collectively called a job configuration, for general dataflow programs based on user objectives. UDAO embodies key techniques including in-situ modeling, which learns a model for each user objective in the same computing environment as the job is run, and multi-objective optimization, which computes a Pareto optimal set of job configurations to reveal tradeoffs between different objectives. Using benchmarks developed based on industry needs, our demonstration will allow the user to explore (1) learned models to gain insights into how various parameters affect user objectives; (2) Pareto frontiers to understand interesting tradeoffs between different objectives and how a configuration recommended by the optimizer explores these tradeoffs; (3) end-to-end benefits that UDAO can provide over default configurations or those manually tuned by engineers.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Big data analytics systems today still lack the ability to take user performance goals and budgetary constraints, collectively referred to as “objectives”, and automatically configure an analytic job to achieve the objectives. This paper presents UDAO, a unified data analytics optimizer that can automatically determine the parameters of the runtime system, collectively called a job configuration, for general dataflow programs based on user objectives. UDAO embodies key techniques including in-situ modeling, which learns a model for each user objective in the same computing environment as the job is run, and multi-objective optimization, which computes a Pareto optimal set of job configurations to reveal tradeoffs between different objectives. Using benchmarks developed based on industry needs, our demonstration will allow the user to explore (1) learned models to gain insights into how various parameters affect user objectives; (2) Pareto frontiers to understand interesting tradeoffs between different objectives and how a configuration recommended by the optimizer explores these tradeoffs; (3) end-to-end benefits that UDAO can provide over default configurations or those manually tuned by engineers. |
Ios Kotsogiannis, Yuchao Tao, Xi He, Maryam Fanaeepour, Ashwin Machanavajjhala, Michael Hay, Gerome Miklau PrivateSQL: A Differentially Private SQL Query Engine Journal Article PVLDB, 12 (11), pp. 1371–1384, 2019. @article{DBLP:journals/pvldb/KotsogiannisTHF19, title = {PrivateSQL: A Differentially Private SQL Query Engine}, author = {Ios Kotsogiannis and Yuchao Tao and Xi He and Maryam Fanaeepour and Ashwin Machanavajjhala and Michael Hay and Gerome Miklau}, url = {http://www.vldb.org/pvldb/vol12/p1371-kotsogiannis.pdf}, doi = {10.14778/3342263.3342274}, year = {2019}, date = {2019-01-01}, journal = {PVLDB}, volume = {12}, number = {11}, pages = {1371--1384}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Dan Zhang, Ryan McKenna, Ios Kotsogiannis, George Bissias, Michael Hay, Ashwin Machanavajjhala, Gerome Miklau Ektelo: A Framework for Defining Differentially-Private Computations Journal Article SIGMOD Record, 48 (1), pp. 15–22, 2019. @article{DBLP:journals/sigmod/ZhangMKBHMM19, title = {Ektelo: A Framework for Defining Differentially-Private Computations}, author = {Dan Zhang and Ryan McKenna and Ios Kotsogiannis and George Bissias and Michael Hay and Ashwin Machanavajjhala and Gerome Miklau}, url = {https://doi.org/10.1145/3371316.3371321}, doi = {10.1145/3371316.3371321}, year = {2019}, date = {2019-01-01}, journal = {SIGMOD Record}, volume = {48}, number = {1}, pages = {15--22}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Brian Hentschel, Peter J Haas, Yuanyuan Tian Online Model Management via Temporally Biased Sampling Journal Article SIGMOD Record, 48 (1), pp. 69–76, 2019. @article{DBLP:journals/sigmod/HentschelHT19, title = {Online Model Management via Temporally Biased Sampling}, author = {Brian Hentschel and Peter J Haas and Yuanyuan Tian}, url = {https://doi.org/10.1145/3371316.3371333 https://people.cs.umass.edu/~phaas/files/tbs-sigmod-record.pdf}, doi = {10.1145/3371316.3371333}, year = {2019}, date = {2019-01-01}, journal = {SIGMOD Record}, volume = {48}, number = {1}, pages = {69--76}, abstract = {To maintain the accuracy of supervised learning models in the presence of evolving data streams, we provide temporally-biased sampling schemes that weight recent data most heavily, with inclusion probabilities for a given data item decaying exponentially over time. We then periodically retrain the models on the current sample. We provide and analyze both a simple sampling scheme (T-TBS) that probabilistically maintains a target sample size and a novel reservoir-based scheme (R-TBS) that is the first to provide both control over the decay rate and a guaranteed upper bound on the sample size. The R-TBS and T-TBS schemes are of independent interest, extending the known set of unequal-probability sampling schemes. We discuss distributed implementation strategies; experiments in Spark show that our approach can increase machine learning accuracy and robustness in the face of evolving data.}, keywords = {}, pubstate = {published}, tppubtype = {article} } To maintain the accuracy of supervised learning models in the presence of evolving data streams, we provide temporally-biased sampling schemes that weight recent data most heavily, with inclusion probabilities for a given data item decaying exponentially over time. We then periodically retrain the models on the current sample. We provide and analyze both a simple sampling scheme (T-TBS) that probabilistically maintains a target sample size and a novel reservoir-based scheme (R-TBS) that is the first to provide both control over the decay rate and a guaranteed upper bound on the sample size. The R-TBS and T-TBS schemes are of independent interest, extending the known set of unequal-probability sampling schemes. We discuss distributed implementation strategies; experiments in Spark show that our approach can increase machine learning accuracy and robustness in the face of evolving data. |
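As a rough illustration of the exponential-decay idea in this last entry, the sketch below admits arriving items with a fixed probability and lets every retained item survive each time step independently with probability exp(-decay), so an item's inclusion probability decays exponentially with its age. This conveys only the flavor of the approach; it is not the paper's T-TBS or R-TBS algorithms, which additionally control the sample size.

```python
# Minimal sketch of temporally biased sampling with exponential decay:
# an item of age a remains in the sample with probability p_admit * exp(-decay * a).
# Illustration only; not the paper's T-TBS or R-TBS schemes.
import math
import random

def tbs_step(sample, new_batch, p_admit=0.5, decay=0.1, rng=random):
    """Advance the sample by one time step given a batch of newly arrived items."""
    survive_p = math.exp(-decay)
    # Age the current sample: each retained item survives w.p. exp(-decay).
    sample = [x for x in sample if rng.random() < survive_p]
    # Admit new arrivals with probability p_admit.
    sample.extend(x for x in new_batch if rng.random() < p_admit)
    return sample

# Feed 50 batches; recent items dominate the sample while older ones fade out.
rng = random.Random(42)
sample = []
for t in range(50):
    batch = [(t, i) for i in range(20)]          # items tagged with arrival time
    sample = tbs_step(sample, batch, rng=rng)
print(sorted({t for t, _ in sample}))            # mostly recent arrival times
```

In the workflow the abstract describes, a model would then be periodically retrained on the current sample, so the training data automatically tracks the recent behavior of the stream.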