2023 |
Juelin Liu, Sandeep Polisetty, Hui Guan, Marco Serafini GraphMini: Accelerating Graph Pattern Matching Using Auxiliary Graphs Inproceedings International Conference on Parallel Architectures and Compilation Techniques, 2023. @inproceedings{liu2023graphmini, title = {GraphMini: Accelerating Graph Pattern Matching Using Auxiliary Graphs}, author = {Juelin Liu and Sandeep Polisetty and Hui Guan and Marco Serafini}, url = {https://marcoserafini.github.io/assets/pdf/GraphMini.pdf}, year = {2023}, date = {2023-11-01}, booktitle = {International Conference on Parallel Architectures and Compilation Techniques}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Francisco Castro, Sahitya Raipura, Heather Conboy, Peter J Haas, Leon Osterweil, Ivon Arroyo Piloting an Interactive Ethics and Responsible Computing Learning Environment in Undergraduate CS Courses Inproceedings Proceedings of the 54th ACM Technical Symposium on Computer Science Education, 2023. @inproceedings{castro2023piloting, title = {Piloting an Interactive Ethics and Responsible Computing Learning Environment in Undergraduate CS Courses}, author = {Francisco Castro and Sahitya Raipura and Heather Conboy and Peter J Haas and Leon Osterweil and Ivon Arroyo}, year = {2023}, date = {2023-11-01}, booktitle = {Proceedings of the 54th ACM Technical Symposium on Computer Science Education}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Wang Cen, Peter J Haas NIM: Generative Neural Networks for Automated Modeling and Generation of Simulation Inputs Journal Article ACM Transactions on Modeling and Computer Simulation, 33 (3), pp. 1–26, 2023. @article{cen2023nim, title = {NIM: Generative Neural Networks for Automated Modeling and Generation of Simulation Inputs}, author = {Wang Cen and Peter J Haas}, year = {2023}, date = {2023-11-01}, journal = {ACM Transactions on Modeling and Computer Simulation}, volume = {33}, number = {3}, pages = {1--26}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Chaitra Gopalappa, Hari Balasubramanian, Peter J Haas A new mixed agent-based network and compartmental simulation framework for joint modeling of related infectious diseases- application to sexually transmitted infections Journal Article Infectious Disease Modelling, 8 (1), pp. 84-100, 2023. @article{gopalappa2023new, title = {A new mixed agent-based network and compartmental simulation framework for joint modeling of related infectious diseases- application to sexually transmitted infections}, author = {Chaitra Gopalappa and Hari Balasubramanian and Peter J Haas}, doi = {https://doi.org/10.1016/j.idm.2022.12.003}, year = {2023}, date = {2023-03-01}, journal = {Infectious Disease Modelling}, volume = {8}, number = {1}, pages = {84-100}, abstract = {Background A model that jointly simulates infectious diseases with common modes of transmission can serve as a decision-analytic tool to identify optimal intervention combinations for overall disease prevention. In the United States, sexually transmitted infections (STIs) are a huge economic burden, with a large fraction of the burden attributed to HIV. Data also show interactions between HIV and other sexually transmitted infections (STIs), such as higher risk of acquisition and progression of co-infections among persons with HIV compared to persons without. However, given the wide range in prevalence and incidence burdens of STIs, current compartmental or agent-based network simulation methods alone are insufficient or computationally burdensome for joint disease modeling. Further, causal factors for higher risk of coinfection could be both behavioral (i.e., compounding effects of individual behaviors, network structures, and care behaviors) and biological (i.e., presence of one disease can biologically increase the risk of another). However, the data on the fraction attributed to each are limited. Methods We present a new mixed agent-based compartmental (MAC) framework for jointly modeling STIs. It uses a combination of a new agent-based evolving network modeling (ABENM) technique for lower-prevalence diseases and compartmental modeling for higher-prevalence diseases. 
As a demonstration, we applied MAC to simulate lower-prevalence HIV in the United States and a higher-prevalence hypothetical Disease 2, using a range of transmission and progression rates to generate burdens replicative of the wide range of STIs. We simulated sexual transmissions among heterosexual males, heterosexual females, and men who have sex with men (men only and men and women). Setting the biological risk of co-infection to zero, we conducted numerical analyses to evaluate the influence of behavioral factors alone on disease dynamics. Results The contribution of behavioral factors to risk of coinfection was sensitive to disease burden, care access, and population heterogeneity and mixing. The contribution of behavioral factors was generally lower than observed risk of coinfections for the range of hypothetical prevalence studied here, suggesting potential role of biological factors, that should be investigated further specific to an STI. Conclusions The purpose of this study is to present a new simulation technique for jointly modeling infectious diseases that have common modes of transmission but varying epidemiological features. The numerical analysis serves as proof-of-concept for the application to STIs. Interactions between diseases are influenced by behavioral factors, are sensitive to care access and population features, and are likely exacerbated by biological factors. Social and economic conditions are among key drivers of behaviors that increase STI transmission, and thus, structural interventions are a key part of behavioral interventions. Joint modeling of diseases helps comprehensively simulate behavioral and biological factors of disease interactions to evaluate the true impact of common structural interventions on overall disease prevention. 
The new simulation framework is especially suited to simulate behavior as a function of social determinants, and further, to identify optimal combinations of common structural and disease-specific interventions.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Background A model that jointly simulates infectious diseases with common modes of transmission can serve as a decision-analytic tool to identify optimal intervention combinations for overall disease prevention. In the United States, sexually transmitted infections (STIs) are a huge economic burden, with a large fraction of the burden attributed to HIV. Data also show interactions between HIV and other sexually transmitted infections (STIs), such as higher risk of acquisition and progression of co-infections among persons with HIV compared to persons without. However, given the wide range in prevalence and incidence burdens of STIs, current compartmental or agent-based network simulation methods alone are insufficient or computationally burdensome for joint disease modeling. Further, causal factors for higher risk of coinfection could be both behavioral (i.e., compounding effects of individual behaviors, network structures, and care behaviors) and biological (i.e., presence of one disease can biologically increase the risk of another). However, the data on the fraction attributed to each are limited. Methods We present a new mixed agent-based compartmental (MAC) framework for jointly modeling STIs. It uses a combination of a new agent-based evolving network modeling (ABENM) technique for lower-prevalence diseases and compartmental modeling for higher-prevalence diseases. As a demonstration, we applied MAC to simulate lower-prevalence HIV in the United States and a higher-prevalence hypothetical Disease 2, using a range of transmission and progression rates to generate burdens replicative of the wide range of STIs. 
We simulated sexual transmissions among heterosexual males, heterosexual females, and men who have sex with men (men only and men and women). Setting the biological risk of co-infection to zero, we conducted numerical analyses to evaluate the influence of behavioral factors alone on disease dynamics. Results The contribution of behavioral factors to risk of coinfection was sensitive to disease burden, care access, and population heterogeneity and mixing. The contribution of behavioral factors was generally lower than observed risk of coinfections for the range of hypothetical prevalence studied here, suggesting potential role of biological factors, that should be investigated further specific to an STI. Conclusions The purpose of this study is to present a new simulation technique for jointly modeling infectious diseases that have common modes of transmission but varying epidemiological features. The numerical analysis serves as proof-of-concept for the application to STIs. Interactions between diseases are influenced by behavioral factors, are sensitive to care access and population features, and are likely exacerbated by biological factors. Social and economic conditions are among key drivers of behaviors that increase STI transmission, and thus, structural interventions are a key part of behavioral interventions. Joint modeling of diseases helps comprehensively simulate behavioral and biological factors of disease interactions to evaluate the true impact of common structural interventions on overall disease prevention. The new simulation framework is especially suited to simulate behavior as a function of social determinants, and further, to identify optimal combinations of common structural and disease-specific interventions. |
Brian Hentschel, Peter J Haas, Yuanyuan Tian Exact PPS Sampling with Bounded Sample Size Journal Article Information Processing Letters, 182 , 2023. @article{hentschel2023exact, title = {Exact PPS Sampling with Bounded Sample Size}, author = {Brian Hentschel and Peter J Haas and Yuanyuan Tian}, url = {https://www.sciencedirect.com/science/article/pii/S002001902300025X?casa_token=znQuScsK51AAAAAA:0EEwp2QPaW7PetEN0NGy8wN0dsOC6Thx9voXduNx6ivnyMIg0WVuMI83TVVt2yiXqu0n4Atnwg}, year = {2023}, date = {2023-08-01}, journal = {Information Processing Letters}, volume = {182}, abstract = {Probability proportional to size (PPS) sampling schemes with a target sample size aim to produce a sample comprising a specified number n of items while ensuring that each item in the population appears in the sample with a probability proportional to its specified "weight" (also called its "size"). These two objectives, however, cannot always be achieved simultaneously. Existing PPS schemes prioritize control of the sample size, violating the PPS property if necessary. We provide a new PPS scheme that allows a different trade-off: our method enforces the PPS property at all times while ensuring that the sample size never exceeds the target value n. The sample size is exactly equal to n if possible, and otherwise has maximal expected value and minimal variance. Thus we bound the sample size, thereby avoiding storage overflows and helping to control the time required for analytics over the sample, while allowing the user complete control over the sample contents. 
The method is both simple to implement and efficient, being a one-pass streaming algorithm with an amortized processing time of O(1) per item.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Probability proportional to size (PPS) sampling schemes with a target sample size aim to produce a sample comprising a specified number n of items while ensuring that each item in the population appears in the sample with a probability proportional to its specified "weight" (also called its "size"). These two objectives, however, cannot always be achieved simultaneously. Existing PPS schemes prioritize control of the sample size, violating the PPS property if necessary. We provide a new PPS scheme that allows a different trade-off: our method enforces the PPS property at all times while ensuring that the sample size never exceeds the target value n. The sample size is exactly equal to n if possible, and otherwise has maximal expected value and minimal variance. Thus we bound the sample size, thereby avoiding storage overflows and helping to control the time required for analytics over the sample, while allowing the user complete control over the sample contents. The method is both simple to implement and efficient, being a one-pass streaming algorithm with an amortized processing time of O(1) per item. |
2022 |
Ryan McKenna, Brett Mullins, Daniel Sheldon, Gerome Miklau AIM: an adaptive and iterative mechanism for differentially private synthetic data Inproceedings Proceedings of the VLDB Endowment, pp. 2599–2612, 2022. @inproceedings{McKenna2022AIM, title = {AIM: an adaptive and iterative mechanism for differentially private synthetic data}, author = {Ryan McKenna and Brett Mullins and Daniel Sheldon and Gerome Miklau}, doi = {https://doi.org/10.14778/3551793.3551817}, year = {2022}, date = {2022-06-01}, booktitle = {Proceedings of the VLDB Endowment}, journal = {Proceedings of the VLDB Endowment}, volume = {15}, number = {11}, pages = {2599–2612}, abstract = {We propose AIM, a new algorithm for differentially private synthetic data generation. AIM is a workload-adaptive algorithm within the paradigm of algorithms that first selects a set of queries, then privately measures those queries, and finally generates synthetic data from the noisy measurements. It uses a set of innovative features to iteratively select the most useful measurements, reflecting both their relevance to the workload and their value in approximating the input data. We also provide analytic expressions to bound per-query error with high probability which can be used to construct confidence intervals and inform users about the accuracy of generated data. We show empirically that AIM consistently outperforms a wide variety of existing mechanisms across a variety of experimental settings.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } We propose AIM, a new algorithm for differentially private synthetic data generation. AIM is a workload-adaptive algorithm within the paradigm of algorithms that first selects a set of queries, then privately measures those queries, and finally generates synthetic data from the noisy measurements. 
It uses a set of innovative features to iteratively select the most useful measurements, reflecting both their relevance to the workload and their value in approximating the input data. We also provide analytic expressions to bound per-query error with high probability which can be used to construct confidence intervals and inform users about the accuracy of generated data. We show empirically that AIM consistently outperforms a wide variety of existing mechanisms across a variety of experimental settings. |
Dave Archer, Michael A August, Georgios Bouloukakis, Christopher Davison, Mamadou H Diallo, Dhrubajyoti Ghosh, Christopher T Graves, Michael Hay, Xi He, Peeter Laud, Steve Lu, Ashwin Machanavajjhala, Sharad Mehrotra, Gerome Miklau, Alisa Pankova, Shantanu Sharma, Nalini Venkatasubramanian, Guoxi Wang, Roberto Yus Transitioning from testbeds to ships: an experience study in deploying the TIPPERS Internet of Things platform to the US Navy Journal Article The Journal of Defense Modeling and Simulation, 19 (3), pp. 501-517, 2022. @article{Archer2022transition, title = {Transitioning from testbeds to ships: an experience study in deploying the TIPPERS Internet of Things platform to the US Navy}, author = {Dave Archer and Michael A August and Georgios Bouloukakis and Christopher Davison and Mamadou H Diallo and Dhrubajyoti Ghosh and Christopher T Graves and Michael Hay and Xi He and Peeter Laud and Steve Lu and Ashwin Machanavajjhala and Sharad Mehrotra and Gerome Miklau and Alisa Pankova and Shantanu Sharma and Nalini Venkatasubramanian and Guoxi Wang and Roberto Yus}, doi = {https://doi.org/10.1177/154851292095638}, year = {2022}, date = {2022-07-01}, journal = {The Journal of Defense Modeling and Simulation}, volume = {19}, number = {3}, pages = {501-517}, abstract = {This paper describes the collaborative effort between privacy and security researchers at nine different institutions along with researchers at the Naval Information Warfare Center to deploy, test, and demonstrate privacy-preserving technologies in creating sensor-based awareness using the Internet of Things (IoT) aboard naval vessels in the context of the US Navy’s Trident Warrior 2019 exercise. Funded by DARPA through the Brandeis program, the team built an integrated IoT data management middleware, entitled TIPPERS, that supports privacy by design and integrates a variety of Privacy Enhancing Technologies (PETs), including differential privacy, computation on encrypted data, and fine-grained policies. 
We describe the architecture of TIPPERS and its use in creating a smart ship that offers IoT-enabled services such as occupancy analysis, fall detection, detection of unauthorized access to spaces, and other situational awareness scenarios. We describe the privacy implications of creating IoT spaces that collect data that might include individuals’ data (e.g., location) and analyze the tradeoff between privacy and utility of the supported PETs in this context.}, keywords = {}, pubstate = {published}, tppubtype = {article} } This paper describes the collaborative effort between privacy and security researchers at nine different institutions along with researchers at the Naval Information Warfare Center to deploy, test, and demonstrate privacy-preserving technologies in creating sensor-based awareness using the Internet of Things (IoT) aboard naval vessels in the context of the US Navy’s Trident Warrior 2019 exercise. Funded by DARPA through the Brandeis program, the team built an integrated IoT data management middleware, entitled TIPPERS, that supports privacy by design and integrates a variety of Privacy Enhancing Technologies (PETs), including differential privacy, computation on encrypted data, and fine-grained policies. We describe the architecture of TIPPERS and its use in creating a smart ship that offers IoT-enabled services such as occupancy analysis, fall detection, detection of unauthorized access to spaces, and other situational awareness scenarios. We describe the privacy implications of creating IoT spaces that collect data that might include individuals’ data (e.g., location) and analyze the tradeoff between privacy and utility of the supported PETs in this context. |
Sainyam Galhotra, Anna Fariha, Raoni Lourenço, Juliana Freire, Alexandra Meliou, Divesh Srivastava DataPrism: Exposing Disconnect between Data and Systems Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2022, pp. 217-231, 2022. @inproceedings{galhotra2022dataprism, title = {DataPrism: Exposing Disconnect between Data and Systems}, author = {Sainyam Galhotra and Anna Fariha and Raoni Lourenço and Juliana Freire and Alexandra Meliou and Divesh Srivastava}, doi = {https://doi.org/10.1145/3514221.3517864}, year = {2022}, date = {2022-06-01}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2022}, pages = {217-231}, abstract = {As data is a central component of many modern systems, the cause of a system malfunction may reside in the data, and, specifically, particular properties of data. E.g., a health-monitoring system that is designed under the assumption that weight is reported in lbs will malfunction when encountering weight reported in kilograms. Like software debugging, which aims to find bugs in the source code or runtime conditions, our goal is to debug data to identify potential sources of disconnect between the assumptions about some data and systems that operate on that data. We propose DataPrism, a framework to identify data properties (profiles) that are the root causes of performance degradation or failure of a data-driven system. Such identification is necessary to repair data and resolve the disconnect between data and systems. Our technique is based on causal reasoning through interventions: when a system malfunctions for a dataset, DataPrism alters the data profiles and observes changes in the system's behavior due to the alteration. Unlike statistical observational analysis that reports mere correlations, DataPrism reports causally verified root causes -- in terms of data profiles -- of the system malfunction. 
We empirically evaluate DataPrism on seven real-world and several synthetic data-driven systems that fail on certain datasets due to a diverse set of reasons. In all cases, DataPrism identifies the root causes precisely while requiring orders of magnitude fewer interventions than prior techniques.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } As data is a central component of many modern systems, the cause of a system malfunction may reside in the data, and, specifically, particular properties of data. E.g., a health-monitoring system that is designed under the assumption that weight is reported in lbs will malfunction when encountering weight reported in kilograms. Like software debugging, which aims to find bugs in the source code or runtime conditions, our goal is to debug data to identify potential sources of disconnect between the assumptions about some data and systems that operate on that data. We propose DataPrism, a framework to identify data properties (profiles) that are the root causes of performance degradation or failure of a data-driven system. Such identification is necessary to repair data and resolve the disconnect between data and systems. Our technique is based on causal reasoning through interventions: when a system malfunctions for a dataset, DataPrism alters the data profiles and observes changes in the system's behavior due to the alteration. Unlike statistical observational analysis that reports mere correlations, DataPrism reports causally verified root causes -- in terms of data profiles -- of the system malfunction. We empirically evaluate DataPrism on seven real-world and several synthetic data-driven systems that fail on certain datasets due to a diverse set of reasons. In all cases, DataPrism identifies the root causes precisely while requiring orders of magnitude fewer interventions than prior techniques. |
Maliha Tashfia Islam, Anna Fariha, Alexandra Meliou, Babak Salimi Through the data management lens: Experimental analysis and evaluation of fair classification Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2022., pp. 232-246, 2022. @inproceedings{islam2022through, title = {Through the data management lens: Experimental analysis and evaluation of fair classification}, author = {Maliha Tashfia Islam and Anna Fariha and Alexandra Meliou and Babak Salimi}, doi = {https://doi.org/10.1145/3514221.3517841}, year = {2022}, date = {2022-06-01}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2022.}, pages = {232-246}, abstract = {Classification, a heavily-studied data-driven machine learning task, drives an increasing number of prediction systems involving critical human decisions such as loan approval and criminal risk assessment. However, classifiers often demonstrate discriminatory behavior, especially when presented with biased data. Consequently, fairness in classification has emerged as a high-priority research area. Data management research is showing an increasing presence and interest in topics related to data and algorithmic fairness, including the topic of fair classification. The interdisciplinary efforts in fair classification, with machine learning research having the largest presence, have resulted in a large number of fairness notions and a wide range of approaches that have not been systematically evaluated and compared. In this paper, we contribute a broad analysis of 13 fair classification approaches and additional variants, over their correctness, fairness, efficiency, scalability, robustness to data errors, sensitivity to underlying ML model, data efficiency, and stability using a variety of metrics and real-world datasets. 
Our analysis highlights novel insights on the impact of different metrics and high-level approach characteristics on different aspects of performance. We also discuss general principles for choosing approaches suitable for different practical settings, and identify areas where data-management-centric solutions are likely to have the most impact.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Classification, a heavily-studied data-driven machine learning task, drives an increasing number of prediction systems involving critical human decisions such as loan approval and criminal risk assessment. However, classifiers often demonstrate discriminatory behavior, especially when presented with biased data. Consequently, fairness in classification has emerged as a high-priority research area. Data management research is showing an increasing presence and interest in topics related to data and algorithmic fairness, including the topic of fair classification. The interdisciplinary efforts in fair classification, with machine learning research having the largest presence, have resulted in a large number of fairness notions and a wide range of approaches that have not been systematically evaluated and compared. In this paper, we contribute a broad analysis of 13 fair classification approaches and additional variants, over their correctness, fairness, efficiency, scalability, robustness to data errors, sensitivity to underlying ML model, data efficiency, and stability using a variety of metrics and real-world datasets. Our analysis highlights novel insights on the impact of different metrics and high-level approach characteristics on different aspects of performance. We also discuss general principles for choosing approaches suitable for different practical settings, and identify areas where data-management-centric solutions are likely to have the most impact. |
Raghavendra Addanki, Andrew McGregor, Alexandra Meliou, Zafeiria Moumoulidou Improved approximation and scalability for fair max-min diversification Inproceedings ICDT 2022, 2022. @inproceedings{addanki2022improved, title = {Improved approximation and scalability for fair max-min diversification}, author = {Raghavendra Addanki and Andrew McGregor and Alexandra Meliou and Zafeiria Moumoulidou}, url = {https://arxiv.org/abs/2201.06678}, year = {2022}, date = {2022-01-01}, booktitle = {ICDT 2022}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Wang Cen, Peter J. Haas Enhanced Simulation Metamodeling via Graph and Generative Neural Networks Inproceedings Winter Simulation Conference, WSC 2022, Singapore, December 11-14, 2022, pp. 2748-2759, 2022. @inproceedings{cen2022enhanced, title = {Enhanced Simulation Metamodeling via Graph and Generative Neural Networks}, author = {Wang Cen and Peter J. Haas}, doi = {https://doi.org/10.1109/WSC57314.2022.10015361}, year = {2022}, date = {2022-01-01}, booktitle = {Winter Simulation Conference, WSC 2022, Singapore, December 11-14, 2022}, pages = {2748-2759}, abstract = {For large, complex simulation models, simulation metamodeling is crucial for enabling simulation-based-optimization under uncertainty in operational settings where results are needed quickly. We enhance simulation metamodeling in two important ways. First, we use graph neural networks (GrNN) to allow the graphical structure of a simulation model to be treated as a metamodel input parameter that can be varied along with real-valued and integer-ordered inputs. Second, we combine GrNNs with generative neural networks so that a metamodel can rapidly produce not only a summary statistic like E[Y] , but also a sequence of i.i.d. samples of Y or even a stochastic process that mimics dynamic simulation outputs. Thus a single metamodel can be used to estimate multiple statistics for multiple performance measures. Our metamodels can potentially serve as surrogate models in digital-twin settings. Preliminary experiments indicate the promise of our approach.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } For large, complex simulation models, simulation metamodeling is crucial for enabling simulation-based-optimization under uncertainty in operational settings where results are needed quickly. We enhance simulation metamodeling in two important ways. 
First, we use graph neural networks (GrNN) to allow the graphical structure of a simulation model to be treated as a metamodel input parameter that can be varied along with real-valued and integer-ordered inputs. Second, we combine GrNNs with generative neural networks so that a metamodel can rapidly produce not only a summary statistic like E[Y] , but also a sequence of i.i.d. samples of Y or even a stochastic process that mimics dynamic simulation outputs. Thus a single metamodel can be used to estimate multiple statistics for multiple performance measures. Our metamodels can potentially serve as surrogate models in digital-twin settings. Preliminary experiments indicate the promise of our approach. |
Sneha Gathani, Madelon Hulsebos, James Gale, Peter J. Haas, Çağatay Demiralp Augmenting Decision Making via Interactive What-If Analysis Inproceedings 12th Conference on Innovative Data Systems Research, CIDR 2022, Chaminade, CA, USA, January 9-12, 2022, 2022. @inproceedings{gathani2021augmenting, title = {Augmenting Decision Making via Interactive What-If Analysis}, author = {Sneha Gathani and Madelon Hulsebos and James Gale and Peter J. Haas and Çağatay Demiralp }, url = {https://www.cidrdb.org/cidr2022/papers/p49-gathani.pdf}, year = {2022}, date = {2022-01-09}, booktitle = {12th Conference on Innovative Data Systems Research, CIDR 2022, Chaminade, CA, USA, January 9-12, 2022}, abstract = {The fundamental goal of business data analysis is to improve business decisions using data. Business users often make decisions to achieve key performance indicators (KPIs) such as increasing customer retention or sales, or decreasing costs. To discover the relationship between data attributes hypothesized to be drivers and those corresponding to KPIs of interest, business users currently need to perform lengthy exploratory analyses. This involves considering multitudes of combinations and scenarios and performing slicing, dicing, and transformations on the data accordingly, e.g., analyzing customer retention across quarters of the year or suggesting optimal media channels across strata of customers. However, the increasing complexity of datasets combined with the cognitive limitations of humans makes it challenging to carry over multiple hypotheses, even for simple datasets. Therefore mentally performing such analyses is hard. Existing commercial tools either provide partial solutions or fail to cater to business users altogether. Here we argue for four functionalities to enable business users to interactively learn and reason about the relationships between sets of data attributes thereby facilitating data-driven decision making. 
We implement these functionalities in SystemD, an interactive visual data analysis system enabling business users to experiment with the data by asking what-if questions. We evaluate the system through three business use cases: marketing mix modeling, customer retention analysis, and deal closing analysis, and report on feedback from multiple business users. Users find the SystemD functionalities highly useful for quick testing and validation of their hypotheses around their KPIs of interest, addressing their unmet analysis needs. The feedback also suggests that the UX design can be enhanced to further improve the understandability of these functionalities.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The fundamental goal of business data analysis is to improve business decisions using data. Business users often make decisions to achieve key performance indicators (KPIs) such as increasing customer retention or sales, or decreasing costs. To discover the relationship between data attributes hypothesized to be drivers and those corresponding to KPIs of interest, business users currently need to perform lengthy exploratory analyses. This involves considering multitudes of combinations and scenarios and performing slicing, dicing, and transformations on the data accordingly, e.g., analyzing customer retention across quarters of the year or suggesting optimal media channels across strata of customers. However, the increasing complexity of datasets combined with the cognitive limitations of humans makes it challenging to carry over multiple hypotheses, even for simple datasets. Therefore mentally performing such analyses is hard. Existing commercial tools either provide partial solutions or fail to cater to business users altogether. Here we argue for four functionalities to enable business users to interactively learn and reason about the relationships between sets of data attributes thereby facilitating data-driven decision making. 
We implement these functionalities in SystemD, an interactive visual data analysis system enabling business users to experiment with the data by asking what-if questions. We evaluate the system through three business use cases: marketing mix modeling, customer retention analysis, and deal closing analysis, and report on feedback from multiple business users. Users find the SystemD functionalities highly useful for quick testing and validation of their hypotheses around their KPIs of interest, addressing their unmet analysis needs. The feedback also suggests that the UX design can be enhanced to further improve the understandability of these functionalities. |
Azza Abouzied, Peter J Haas, Alexandra Meliou In-Database Decision Support: Opportunities and Challenges Journal Article IEEE Data Engineering Bulletin, 45 (3), pp. 102-115, 2022. @article{abouzied2022database, title = {In-Database Decision Support: Opportunities and Challenges}, author = {Azza Abouzied and Peter J Haas and Alexandra Meliou}, url = {https://people.cs.umass.edu/~phaas/files/DEBulletin2022.pdf}, year = {2022}, date = {2022-09-01}, journal = {IEEE Data Engineering Bulletin}, volume = {45}, number = {3}, pages = {102-115}, abstract = {Decision makers in a broad range of domains, such as finance, transportation, manufacturing, and healthcare, often need to derive optimal decisions given a set of constraints and objectives. Traditional solutions to such constrained optimization problems are typically application-specific, complex, and do not generalize. Further, the usual workflow requires slow, cumbersome, and error-prone data movement between a database and predictive-modeling and optimization packages. All of these problems are exacerbated by the unprecedented size of modern data-intensive optimization problems. The emerging research area of in-database prescriptive analytics aims to provide seamless domain-independent, declarative, and scalable approaches powered by the system where the data typically resides: the database. Integrating optimization with database technology opens up prescriptive analytics to a much broader community, amplifying its benefits. In the context of our prior and ongoing work in this area, we discuss some strategies for addressing key challenges related to usability, scalability, data uncertainty, dynamic environments with changing data and models, and the need to support decision-making agents. 
We indicate how deep integration between the DBMS, predictive models, and optimization software creates opportunities for rich prescriptive-query functionality with good scalability and performance.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Decision makers in a broad range of domains, such as finance, transportation, manufacturing, and healthcare, often need to derive optimal decisions given a set of constraints and objectives. Traditional solutions to such constrained optimization problems are typically application-specific, complex, and do not generalize. Further, the usual workflow requires slow, cumbersome, and error-prone data movement between a database and predictive-modeling and optimization packages. All of these problems are exacerbated by the unprecedented size of modern data-intensive optimization problems. The emerging research area of in-database prescriptive analytics aims to provide seamless domain-independent, declarative, and scalable approaches powered by the system where the data typically resides: the database. Integrating optimization with database technology opens up prescriptive analytics to a much broader community, amplifying its benefits. In the context of our prior and ongoing work in this area, we discuss some strategies for addressing key challenges related to usability, scalability, data uncertainty, dynamic environments with changing data and models, and the need to support decision-making agents. We indicate how deep integration between the DBMS, predictive models, and optimization software creates opportunities for rich prescriptive-query functionality with good scalability and performance. |
2021 |
Ryan McKenna, Gerome Miklau, Daniel Sheldon Winning the NIST Contest: A scalable and general approach to differentially private synthetic data Journal Article Journal of Privacy and Confidentiality, 11 (3), 2021. @article{McKenna2021winning, title = {Winning the NIST Contest: A scalable and general approach to differentially private synthetic data}, author = {Ryan McKenna and Gerome Miklau and Daniel Sheldon}, doi = {https://doi.org/10.29012/jpc.778}, year = {2021}, date = {2021-12-01}, journal = {Journal of Privacy and Confidentiality}, volume = {11}, number = {3}, abstract = {We propose a general approach for differentially private synthetic data generation, that consists of three steps: (1) select a collection of low-dimensional marginals, (2) measure those marginals with a noise addition mechanism, and (3) generate synthetic data that preserves the measured marginals well. Central to this approach is Private-PGM, a post-processing method that is used to estimate a high-dimensional data distribution from noisy measurements of its marginals. We present two mechanisms, NIST-MST and MST, that are instances of this general approach. NIST-MST was the winning mechanism in the 2018 NIST differential privacy synthetic data competition, and MST is a new mechanism that can work in more general settings, while still performing comparably to NIST-MST. We believe our general approach should be of broad interest, and can be adopted in future mechanisms for synthetic data generation.}, keywords = {}, pubstate = {published}, tppubtype = {article} } We propose a general approach for differentially private synthetic data generation, that consists of three steps: (1) select a collection of low-dimensional marginals, (2) measure those marginals with a noise addition mechanism, and (3) generate synthetic data that preserves the measured marginals well. 
Central to this approach is Private-PGM, a post-processing method that is used to estimate a high-dimensional data distribution from noisy measurements of its marginals. We present two mechanisms, NIST-MST and MST, that are instances of this general approach. NIST-MST was the winning mechanism in the 2018 NIST differential privacy synthetic data competition, and MST is a new mechanism that can work in more general settings, while still performing comparably to NIST-MST. We believe our general approach should be of broad interest, and can be adopted in future mechanisms for synthetic data generation. |
Ryan McKenna, Siddhant Pradhan, Daniel Sheldon, Gerome Miklau Relaxed marginal consistency for differentially private query answering Inproceedings Advances in Neural Information Processing Systems, pp. 20696-20707, 2021. @inproceedings{McKenna2021relaxed, title = {Relaxed marginal consistency for differentially private query answering}, author = {Ryan McKenna and Siddhant Pradhan and Daniel Sheldon and Gerome Miklau}, url = {https://proceedings.neurips.cc/paper/2021/hash/acb55f9af76808c5fd5522dcdb519fde-Abstract.html}, year = {2021}, date = {2021-12-01}, booktitle = {Advances in Neural Information Processing Systems}, volume = {34}, pages = {20696-20707}, abstract = {Many differentially private algorithms for answering database queries involve a step that reconstructs a discrete data distribution from noisy measurements. This provides consistent query answers and reduces error, but often requires space that grows exponentially with dimension. PRIVATE-PGM is a recent approach that uses graphical models to represent the data distribution, with complexity proportional to that of exact marginal inference in a graphical model with structure determined by the co-occurrence of variables in the noisy measurements. PRIVATE-PGM is highly scalable for sparse measurements, but may fail to run in high dimensions with dense measurements. We overcome the main scalability limitation of PRIVATE-PGM through a principled approach that relaxes consistency constraints in the estimation objective. Our new approach works with many existing private query answering algorithms and improves scalability or accuracy with no privacy cost.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Many differentially private algorithms for answering database queries involve a step that reconstructs a discrete data distribution from noisy measurements. This provides consistent query answers and reduces error, but often requires space that grows exponentially with dimension. 
PRIVATE-PGM is a recent approach that uses graphical models to represent the data distribution, with complexity proportional to that of exact marginal inference in a graphical model with structure determined by the co-occurrence of variables in the noisy measurements. PRIVATE-PGM is highly scalable for sparse measurements, but may fail to run in high dimensions with dense measurements. We overcome the main scalability limitation of PRIVATE-PGM through a principled approach that relaxes consistency constraints in the estimation objective. Our new approach works with many existing private query answering algorithms and improves scalability or accuracy with no privacy cost. |
Yifei Yang, Matt Youill, Matthew Woicik, Yizhou Liu, Xiangyao Yu, Marco Serafini, Ashraf Aboulnaga, Michael Stonebraker FlexPushdownDB: Hybrid Pushdown and Caching in a Cloud DBMS Inproceedings Proceedings of the VLDB Endowment, pp. 2101–2113, 2021. @inproceedings{yang2021flexpushdowndb, title = {FlexPushdownDB: Hybrid Pushdown and Caching in a Cloud DBMS}, author = {Yifei Yang and Matt Youill and Matthew Woicik and Yizhou Liu and Xiangyao Yu and Marco Serafini and Ashraf Aboulnaga and Michael Stonebraker}, doi = {https://doi.org/10.14778/3476249.3476265}, year = {2021}, date = {2021-06-01}, booktitle = {Proceedings of the VLDB Endowment}, volume = {14}, number = {11}, pages = { 2101–2113}, abstract = {Modern cloud databases adopt a storage-disaggregation architecture that separates the management of computation and storage. A major bottleneck in such an architecture is the network connecting the computation and storage layers. Two solutions have been explored to mitigate the bottleneck: caching and computation pushdown. While both techniques can significantly reduce network traffic, existing DBMSs consider them as orthogonal techniques and support only one or the other, leaving potential performance benefits unexploited. In this paper we present FlexPushdownDB (FPDB), an OLAP cloud DBMS prototype that supports fine-grained hybrid query execution to combine the benefits of caching and computation pushdown in a storage-disaggregation architecture. We build a hybrid query executor based on a new concept called separable operators to combine the data from the cache and results from the pushdown processing. We also propose a novel Weighted-LFU cache replacement policy that takes into account the cost of pushdown computation. Our experimental evaluation on the Star Schema Benchmark shows that the hybrid execution outperforms both the conventional caching-only architecture and pushdown-only architecture by 2.2X. 
In the hybrid architecture, our experiments show that Weighted-LFU can outperform the baseline LFU by 37%}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Modern cloud databases adopt a storage-disaggregation architecture that separates the management of computation and storage. A major bottleneck in such an architecture is the network connecting the computation and storage layers. Two solutions have been explored to mitigate the bottleneck: caching and computation pushdown. While both techniques can significantly reduce network traffic, existing DBMSs consider them as orthogonal techniques and support only one or the other, leaving potential performance benefits unexploited. In this paper we present FlexPushdownDB (FPDB), an OLAP cloud DBMS prototype that supports fine-grained hybrid query execution to combine the benefits of caching and computation pushdown in a storage-disaggregation architecture. We build a hybrid query executor based on a new concept called separable operators to combine the data from the cache and results from the pushdown processing. We also propose a novel Weighted-LFU cache replacement policy that takes into account the cost of pushdown computation. Our experimental evaluation on the Star Schema Benchmark shows that the hybrid execution outperforms both the conventional caching-only architecture and pushdown-only architecture by 2.2X. In the hybrid architecture, our experiments show that Weighted-LFU can outperform the baseline LFU by 37% |
Abhinav Jangda, Sandeep Polisetty, Arjun Guha, Marco Serafini Accelerating graph sampling for graph machine learning using GPUs Inproceedings EuroSys '21: Proceedings of the Sixteenth European Conference on Computer Systems, pp. 311–326, 2021. @inproceedings{Jangda2021accelerating, title = {Accelerating graph sampling for graph machine learning using GPUs}, author = {Abhinav Jangda and Sandeep Polisetty and Arjun Guha and Marco Serafini}, doi = {https://doi.org/10.1145/3447786.3456244}, year = {2021}, date = {2021-04-01}, booktitle = {EuroSys '21: Proceedings of the Sixteenth European Conference on Computer Systems}, pages = {311–326}, abstract = {Representation learning algorithms automatically learn the features of data. Several representation learning algorithms for graph data, such as DeepWalk, node2vec, and Graph-SAGE, sample the graph to produce mini-batches that are suitable for training a DNN. However, sampling time can be a significant fraction of training time, and existing systems do not efficiently parallelize sampling. Sampling is an "embarrassingly parallel" problem and may appear to lend itself to GPU acceleration, but the irregularity of graphs makes it hard to use GPU resources effectively. This paper presents NextDoor, a system designed to effectively perform graph sampling on GPUs. NextDoor employs a new approach to graph sampling that we call transit-parallelism, which allows load balancing and caching of edges. NextDoor provides end-users with a high-level abstraction for writing a variety of graph sampling algorithms. We implement several graph sampling applications, and show that NextDoor runs them orders of magnitude faster than existing systems.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Representation learning algorithms automatically learn the features of data. 
Several representation learning algorithms for graph data, such as DeepWalk, node2vec, and Graph-SAGE, sample the graph to produce mini-batches that are suitable for training a DNN. However, sampling time can be a significant fraction of training time, and existing systems do not efficiently parallelize sampling. Sampling is an "embarrassingly parallel" problem and may appear to lend itself to GPU acceleration, but the irregularity of graphs makes it hard to use GPU resources effectively. This paper presents NextDoor, a system designed to effectively perform graph sampling on GPUs. NextDoor employs a new approach to graph sampling that we call transit-parallelism, which allows load balancing and caching of edges. NextDoor provides end-users with a high-level abstraction for writing a variety of graph sampling algorithms. We implement several graph sampling applications, and show that NextDoor runs them orders of magnitude faster than existing systems. |
Marco Serafini, Hui Guan Scalable graph neural network training: The case for sampling Journal Article ACM SIGOPS Operating Systems Review, 55 (1), pp. 68-76, 2021. @article{serafini2021scalable, title = {Scalable graph neural network training: The case for sampling}, author = {Marco Serafini and Hui Guan}, doi = {https://doi.org/10.1145/3469379.3469387}, year = {2021}, date = {2021-01-01}, journal = {ACM SIGOPS Operating Systems Review}, volume = {55}, number = {1}, pages = {68-76}, abstract = {Graph Neural Networks (GNNs) are a new and increasingly popular family of deep neural network architectures to perform learning on graphs. Training them efficiently is challenging due to the irregular nature of graph data. The problem becomes even more challenging when scaling to large graphs that exceed the capacity of single devices. Standard approaches to distributed DNN training, like data and model parallelism, do not directly apply to GNNs. Instead, two different approaches have emerged in the literature: whole-graph and sample-based training. In this paper, we review and compare the two approaches. Scalability is challenging with both approaches, but we make a case that research should focus on sample-based training since it is a more promising approach. Finally, we review recent systems supporting sample-based training.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Graph Neural Networks (GNNs) are a new and increasingly popular family of deep neural network architectures to perform learning on graphs. Training them efficiently is challenging due to the irregular nature of graph data. The problem becomes even more challenging when scaling to large graphs that exceed the capacity of single devices. Standard approaches to distributed DNN training, like data and model parallelism, do not directly apply to GNNs. Instead, two different approaches have emerged in the literature: whole-graph and sample-based training. 
In this paper, we review and compare the two approaches. Scalability is challenging with both approaches, but we make a case that research should focus on sample-based training since it is a more promising approach. Finally, we review recent systems supporting sample-based training. |
Fei Song, Khaled Zaouk, Chenghao Lyu, Arnab Sinha, Qi Fan, Yanlei Diao, Prashant J Shenoy Spark-based Cloud Data Analytics using Multi-Objective Optimization Inproceedings 37th IEEE International Conference on Data Engineering, ICDE 2021, Chania, Greece, April 19-22, 2021, pp. 396–407, IEEE, 2021. @inproceedings{DBLP:conf/icde/SongZLSFDS21, title = {Spark-based Cloud Data Analytics using Multi-Objective Optimization}, author = {Fei Song and Khaled Zaouk and Chenghao Lyu and Arnab Sinha and Qi Fan and Yanlei Diao and Prashant J Shenoy}, url = {https://doi.org/10.1109/ICDE51399.2021.00041 http://scalla.cs.umass.edu/papers/udao2020.pdf}, doi = {10.1109/ICDE51399.2021.00041}, year = {2021}, date = {2021-01-01}, booktitle = {37th IEEE International Conference on Data Engineering, ICDE 2021, Chania, Greece, April 19-22, 2021}, pages = {396--407}, publisher = {IEEE}, abstract = {Data analytics in the cloud has become an integral part of enterprise businesses. Big data analytics systems, however, still lack the ability to take task objectives such as user performance goals and budgetary constraints and automatically configure an analytic job to achieve these objectives. This paper presents UDAO, a Spark-based Unified Data Analytics Optimizer that can automatically determine a cluster configuration with a suitable number of cores as well as other system parameters that best meet the task objectives. At a core of our work is a principled multi-objective optimization (MOO) approach that computes a Pareto optimal set of configurations to reveal tradeoffs between different objectives, recommends a new Spark configuration that best explores such tradeoffs, and employs novel optimizations to enable such recommendations within a few seconds. Detailed experiments using benchmark workloads show that our MOO techniques provide a 2-50x speedup over existing MOO methods, while offering good coverage of the Pareto frontier. 
Compared to Ottertune, a state-of-the-art performance tuning system, UDAO recommends Spark configurations that yield 26%-49% reduction of running time of the TPCx-BB benchmark while adapting to different user preferences on multiple objectives.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Data analytics in the cloud has become an integral part of enterprise businesses. Big data analytics systems, however, still lack the ability to take task objectives such as user performance goals and budgetary constraints and automatically configure an analytic job to achieve these objectives. This paper presents UDAO, a Spark-based Unified Data Analytics Optimizer that can automatically determine a cluster configuration with a suitable number of cores as well as other system parameters that best meet the task objectives. At a core of our work is a principled multi-objective optimization (MOO) approach that computes a Pareto optimal set of configurations to reveal tradeoffs between different objectives, recommends a new Spark configuration that best explores such tradeoffs, and employs novel optimizations to enable such recommendations within a few seconds. Detailed experiments using benchmark workloads show that our MOO techniques provide a 2-50x speedup over existing MOO methods, while offering good coverage of the Pareto frontier. Compared to Ottertune, a state-of-the-art performance tuning system, UDAO recommends Spark configurations that yield 26%-49% reduction of running time of the TPCx-BB benchmark while adapting to different user preferences on multiple objectives. |
Zafeiria Moumoulidou, Andrew McGregor, Alexandra Meliou Diverse Data Selection under Fairness Constraints Inproceedings International Conference on Database Theory, (ICDT), pp. 11:1–11:25, 2021. @inproceedings{MoumoulidouMM21, title = {Diverse Data Selection under Fairness Constraints}, author = {Zafeiria Moumoulidou and Andrew McGregor and Alexandra Meliou}, url = {https://arxiv.org/pdf/2010.09141.pdf}, year = {2021}, date = {2021-03-23}, booktitle = {International Conference on Database Theory, (ICDT)}, pages = {11:1--11:25}, abstract = {Diversity is an important principle in data selection and summarization, facility location, and recommendation systems. Our work focuses on maximizing diversity in data selection, while offering fairness guarantees. In particular, we offer the first study that augments the Max-Min diversification objective with fairness constraints. More specifically, given a universe U of n elements that can be partitioned into m disjoint groups, we aim to retrieve a k-sized subset that maximizes the pairwise minimum distance within the set (diversity) and contains a pre-specified ki number of elements from each group i (fairness). We show that this problem is NP-complete even in metric spaces, and we propose three novel algorithms, linear in n, that provide strong theoretical approximation guarantees for different values of m and k. Finally, we extend our algorithms and analysis to the case where groups can be overlapping.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Diversity is an important principle in data selection and summarization, facility location, and recommendation systems. Our work focuses on maximizing diversity in data selection, while offering fairness guarantees. In particular, we offer the first study that augments the Max-Min diversification objective with fairness constraints. 
More specifically, given a universe U of n elements that can be partitioned into m disjoint groups, we aim to retrieve a k-sized subset that maximizes the pairwise minimum distance within the set (diversity) and contains a pre-specified ki number of elements from each group i (fairness). We show that this problem is NP-complete even in metric spaces, and we propose three novel algorithms, linear in n, that provide strong theoretical approximation guarantees for different values of m and k. Finally, we extend our algorithms and analysis to the case where groups can be overlapping. |
Anna Fariha, Ashish Tiwari, Arjun Radhakrishna, Sumit Gulwani, Alexandra Meliou Conformance Constraint Discovery: Measuring Trust in Data-Driven Systems Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2021. @inproceedings{FarihaTMRG21b, title = {Conformance Constraint Discovery: Measuring Trust in Data-Driven Systems}, author = {Anna Fariha and Ashish Tiwari and Arjun Radhakrishna and Sumit Gulwani and Alexandra Meliou}, url = {https://afariha.github.io/papers/Conformance_Constraints_SIGMOD_2021.pdf}, doi = {10.1145/3448016.3452795}, year = {2021}, date = {2021-06-18}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD)}, abstract = {The reliability of inferences made by data-driven systems hinges on the data’s continued conformance to the systems’ initial settings and assumptions. When serving data (on which we want to apply inference) deviates from the profile of the initial training data, the outcome of inference becomes unreliable. We introduce conformance constraints, a new data profiling primitive tailored towards quantifying the degree of non-conformance, which can effectively characterize if inference over that tuple is untrustworthy. Conformance constraints are constraints over certain arithmetic expressions (called projections) involving the numerical attributes of a dataset, which existing data profiling primitives such as functional dependencies and denial constraints cannot model. Our key finding is that projections that incur low variance on a dataset construct effective conformance constraints. This principle yields the surprising result that low-variance components of a principal component analysis, which are usually discarded for dimensionality reduction, generate stronger conformance constraints than the high-variance components. 
Based on this result, we provide a highly scalable and efficient technique—linear in data size and cubic in the number of attributes—for discovering conformance constraints for a dataset. To measure the degree of a tuple’s non-conformance with respect to a dataset, we propose a quantitative semantics that captures how much a tuple violates the conformance constraints of that dataset. We demonstrate the value of conformance constraints on two applications: trusted machine learning and data drift. We empirically show that conformance constraints offer mechanisms to (1) reliably detect tuples on which the inference of a machine-learned model should not be trusted, and (2) quantify data drift more accurately than the state of the art.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The reliability of inferences made by data-driven systems hinges on the data’s continued conformance to the systems’ initial settings and assumptions. When serving data (on which we want to apply inference) deviates from the profile of the initial training data, the outcome of inference becomes unreliable. We introduce conformance constraints, a new data profiling primitive tailored towards quantifying the degree of non-conformance, which can effectively characterize if inference over that tuple is untrustworthy. Conformance constraints are constraints over certain arithmetic expressions (called projections) involving the numerical attributes of a dataset, which existing data profiling primitives such as functional dependencies and denial constraints cannot model. Our key finding is that projections that incur low variance on a dataset construct effective conformance constraints. This principle yields the surprising result that low-variance components of a principal component analysis, which are usually discarded for dimensionality reduction, generate stronger conformance constraints than the high-variance components. 
Based on this result, we provide a highly scalable and efficient technique—linear in data size and cubic in the number of attributes—for discovering conformance constraints for a dataset. To measure the degree of a tuple’s non-conformance with respect to a dataset, we propose a quantitative semantics that captures how much a tuple violates the conformance constraints of that dataset. We demonstrate the value of conformance constraints on two applications: trusted machine learning and data drift. We empirically show that conformance constraints offer mechanisms to (1) reliably detect tuples on which the inference of a machine-learned model should not be trusted, and (2) quantify data drift more accurately than the state of the art. |
Anna Fariha, Ashish Tiwari, Alexandra Meliou, Arjun Radhakrishna, Sumit Gulwani CoCo: Interactive Exploration of Conformance Constraints for Data Understanding and Data Cleaning Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2021. @inproceedings{FarihaTMRG21demo, title = {CoCo: Interactive Exploration of Conformance Constraints for Data Understanding and Data Cleaning}, author = {Anna Fariha and Ashish Tiwari and Alexandra Meliou and Arjun Radhakrishna and Sumit Gulwani}, url = {https://afariha.github.io/papers/CoCo_SIGMOD_2021_Demo.pdf}, doi = {10.1145/3448016.3452750}, year = {2021}, date = {2021-06-18}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD)}, abstract = {Data profiling refers to the task of extracting technical metadata or profiles and has numerous applications such as data understanding, validation, integration, and cleaning. While a number of data profiling primitives exist in the literature, most of them are limited to categorical attributes. A few techniques consider numerical attributes; but, they either focus on simple relationships involving a pair of attributes (e.g., correlations) or convert the continuous semantics of numerical attributes to a discrete semantics, which results in information loss. To capture more complex relationships involving the numerical attributes, we developed a new data-profiling primitive called conformance constraints, which can model linear arithmetic relationships involving multiple numerical attributes. We present CoCo, a system that allows interactive discovery and exploration of Conformance Constraints for understanding trends involving the numerical attributes of a dataset, with a particular focus on the application of data cleaning. Through a simple interface, CoCo enables the user to guide conformance constraint discovery according to their preferences. 
The user can examine to what extent a new, possibly dirty, dataset satisfies or violates the discovered conformance constraints. Further, CoCo provides useful suggestions for cleaning dirty data tuples, where the user can interactively alter cell values, and verify by checking change in conformance constraint violation due to the alteration. We demonstrate how CoCo can help in understanding trends in the data and assist the users in interactive data cleaning, using conformance constraints}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Data profiling refers to the task of extracting technical metadata or profiles and has numerous applications such as data understanding, validation, integration, and cleaning. While a number of data profiling primitives exist in the literature, most of them are limited to categorical attributes. A few techniques consider numerical attributes; but, they either focus on simple relationships involving a pair of attributes (e.g., correlations) or convert the continuous semantics of numerical attributes to a discrete semantics, which results in information loss. To capture more complex relationships involving the numerical attributes, we developed a new data-profiling primitive called conformance constraints, which can model linear arithmetic relationships involving multiple numerical attributes. We present CoCo, a system that allows interactive discovery and exploration of Conformance Constraints for understanding trends involving the numerical attributes of a dataset, with a particular focus on the application of data cleaning. Through a simple interface, CoCo enables the user to guide conformance constraint discovery according to their preferences. The user can examine to what extent a new, possibly dirty, dataset satisfies or violates the discovered conformance constraints. 
Further, CoCo provides useful suggestions for cleaning dirty data tuples, where the user can interactively alter cell values, and verify by checking change in conformance constraint violation due to the alteration. We demonstrate how CoCo can help in understanding trends in the data and assist the users in interactive data cleaning, using conformance constraints |
2020 |
Muhammad Bilal, Marco Serafini, Marco Canini, Rodrigo Rodrigues Do the Best Cloud Configurations Grow on Trees? An Experimental Evaluation of Black Box Algorithms for Optimizing Cloud Workloads Journal Article Proc. VLDB Endow., 13 (11), pp. 2563–2575, 2020. @article{DBLP:journals/pvldb/0007SCR20, title = {Do the Best Cloud Configurations Grow on Trees? An Experimental Evaluation of Black Box Algorithms for Optimizing Cloud Workloads}, author = {Muhammad Bilal and Marco Serafini and Marco Canini and Rodrigo Rodrigues}, url = {http://www.vldb.org/pvldb/vol13/p2563-bilal.pdf}, year = {2020}, date = {2020-01-01}, journal = {Proc. VLDB Endow.}, volume = {13}, number = {11}, pages = {2563--2575}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Matteo Brucato, Miro Mannino, Azza Abouzied, Peter J Haas, Alexandra Meliou sPaQLTooLs: A Stochastic Package Query Interface for Scalable Constrained Optimization Journal Article Proc. VLDB Endow., 13 (12), pp. 2881–2884, 2020. @article{DBLP:journals/pvldb/BrucatoMAHM20, title = {sPaQLTooLs: A Stochastic Package Query Interface for Scalable Constrained Optimization}, author = {Matteo Brucato and Miro Mannino and Azza Abouzied and Peter J Haas and Alexandra Meliou}, url = {http://www.vldb.org/pvldb/vol13/p2881-brucato.pdf}, year = {2020}, date = {2020-01-01}, journal = {Proc. VLDB Endow.}, volume = {13}, number = {12}, pages = {2881--2884}, abstract = {Everyone needs to make decisions under uncertainty and with limited resources, e.g., an investor who is building a stock portfolio subject to an investment budget and a bounded risk tolerance. Doing this with current technology is hard. There is a disconnect between software tools for data management, stochastic predictive modeling (e.g., simulation of future stock prices), and optimization; this leads to cumbersome analytical workflows. Moreover, current methods do not scale. To handle a broad class of uncertainty models, analysts approximate the original stochastic optimization problem by a large deterministic optimization problem that incorporates many “scenarios”, i.e., sample realizations of the uncertain data values. For large problems, a huge number of scenarios is required, often causing the solver to fail. We demonstrate sPaQLTooLs, a system for in-database specification and scalable solution of constrained optimization problems. The key ingredients are (i) a database-oriented specification of constrained stochastic optimization problems as “stochastic package queries” (SPQs), (ii) use of a Monte Carlo database to incorporate stochastic predictive models, and (iii) a new SUMMARYSEARCH algorithm for scalably solving SPQs with approximation guarantees. 
In this demonstration, the attendees will experience first-hand the difficulty of manually constructing feasible and high-quality portfolios, using real-world stock market data. We will then demonstrate how SUMMARYSEARCH can easily and efficiently help them find very good portfolios, while being orders of magnitude faster than prior methods.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Everyone needs to make decisions under uncertainty and with limited resources, e.g., an investor who is building a stock portfolio subject to an investment budget and a bounded risk tolerance. Doing this with current technology is hard. There is a disconnect between software tools for data management, stochastic predictive modeling (e.g., simulation of future stock prices), and optimization; this leads to cumbersome analytical workflows. Moreover, current methods do not scale. To handle a broad class of uncertainty models, analysts approximate the original stochastic optimization problem by a large deterministic optimization problem that incorporates many “scenarios”, i.e., sample realizations of the uncertain data values. For large problems, a huge number of scenarios is required, often causing the solver to fail. We demonstrate sPaQLTooLs, a system for in-database specification and scalable solution of constrained optimization problems. The key ingredients are (i) a database-oriented specification of constrained stochastic optimization problems as “stochastic package queries” (SPQs), (ii) use of a Monte Carlo database to incorporate stochastic predictive models, and (iii) a new SUMMARYSEARCH algorithm for scalably solving SPQs with approximation guarantees. In this demonstration, the attendees will experience first-hand the difficulty of manually constructing feasible and high-quality portfolios, using real-world stock market data. 
We will then demonstrate how SUMMARYSEARCH can easily and efficiently help them find very good portfolios, while being orders of magnitude faster than prior methods. |
Anna Fariha, Matteo Brucato, Peter J Haas, Alexandra Meliou SuDocu: Summarizing Documents by Example Journal Article Proc. VLDB Endow., 13 (12), pp. 2861–2864, 2020. @article{DBLP:journals/pvldb/FarihaBHM20, title = {SuDocu: Summarizing Documents by Example}, author = {Anna Fariha and Matteo Brucato and Peter J Haas and Alexandra Meliou}, url = {http://www.vldb.org/pvldb/vol13/p2861-fariha.pdf}, year = {2020}, date = {2020-01-01}, journal = {Proc. VLDB Endow.}, volume = {13}, number = {12}, pages = {2861--2864}, abstract = {Text document summarization refers to the task of producing a brief representation of a document for easy human consumption. Existing text summarization techniques mostly focus on generic summarization, but users often require personalized summarization that targets their specific preferences and needs. However, precisely expressing preferences is challenging, and current methods are often ambiguous, outside the user’s control, or require costly training data. We propose a novel and effective way to express summarization intent (preferences) via examples: the user provides a few example summaries for a small number of documents in a collection, and the system summarizes the rest. We demonstrate SUDOCU, an example-based personalized DOCUment SUmmarization system. Through a simple interface, SUDOCU allows the users to provide example summaries, learns the summarization intent from the examples, and produces summaries for new documents that reflect the user’s summarization intent. SUDOCU further explains the captured summarization intent in the form of a package query, an extension of a traditional SQL query that handles complex constraints and preferences over answer sets. SUDOCU combines topic modeling, semantic similarity discovery, and in-database optimization in a novel way to achieve example-driven document summarization. 
We demonstrate how SUDOCU can detect complex summarization intents from a few example summaries and produce accurate summaries for new documents effectively and efficiently.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Text document summarization refers to the task of producing a brief representation of a document for easy human consumption. Existing text summarization techniques mostly focus on generic summarization, but users often require personalized summarization that targets their specific preferences and needs. However, precisely expressing preferences is challenging, and current methods are often ambiguous, outside the user’s control, or require costly training data. We propose a novel and effective way to express summarization intent (preferences) via examples: the user provides a few example summaries for a small number of documents in a collection, and the system summarizes the rest. We demonstrate SUDOCU, an example-based personalized DOCUment SUmmarization system. Through a simple interface, SUDOCU allows the users to provide example summaries, learns the summarization intent from the examples, and produces summaries for new documents that reflect the user’s summarization intent. SUDOCU further explains the captured summarization intent in the form of a package query, an extension of a traditional SQL query that handles complex constraints and preferences over answer sets. SUDOCU combines topic modeling, semantic similarity discovery, and in-database optimization in a novel way to achieve example-driven document summarization. We demonstrate how SUDOCU can detect complex summarization intents from a few example summaries and produce accurate summaries for new documents effectively and efficiently. |
Dan Zhang, Yoshihiko Suhara, Jinfeng Li, Madelon Hulsebos, Çağatay Demiralp, Wang-Chiew Tan Sato: Contextual Semantic Type Detection in Tables Journal Article Proc. VLDB Endow., 13 (11), pp. 1835–1848, 2020. @article{DBLP:journals/pvldb/ZhangSLHDT20, title = {Sato: Contextual Semantic Type Detection in Tables}, author = {Dan Zhang and Yoshihiko Suhara and Jinfeng Li and Madelon Hulsebos and Çağatay Demiralp and Wang-Chiew Tan}, url = {http://www.vldb.org/pvldb/vol13/p1835-zhang.pdf}, year = {2020}, date = {2020-01-01}, journal = {Proc. VLDB Endow.}, volume = {13}, number = {11}, pages = {1835--1848}, abstract = {Detecting the semantic types of data columns in relational tables is important for various data preparation and information retrieval tasks such as data cleaning, schema matching, data discovery, and semantic search. However, existing detection approaches either perform poorly with dirty data, support only a limited number of semantic types, fail to incorporate the table context of columns or rely on large sample sizes for training data. We introduce Sato, a hybrid machine learning model to automatically detect the semantic types of columns in tables, exploiting the signals from the table context as well as the column values. Sato combines a deep learning model trained on a large-scale table corpus with topic modeling and structured prediction to achieve support-weighted and macro average F1 scores of 0.925 and 0.735, respectively, exceeding the state-of-the-art performance by a significant margin. We extensively analyze the overall and per-type performance of Sato, discussing how individual modeling components, as well as feature categories, contribute to its performance.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Detecting the semantic types of data columns in relational tables is important for various data preparation and information retrieval tasks such as data cleaning, schema matching, data discovery, and semantic search. 
However, existing detection approaches either perform poorly with dirty data, support only a limited number of semantic types, fail to incorporate the table context of columns or rely on large sample sizes for training data. We introduce Sato, a hybrid machine learning model to automatically detect the semantic types of columns in tables, exploiting the signals from the table context as well as the column values. Sato combines a deep learning model trained on a large-scale table corpus with topic modeling and structured prediction to achieve support-weighted and macro average F1 scores of 0.925 and 0.735, respectively, exceeding the state-of-the-art performance by a significant margin. We extensively analyze the overall and per-type performance of Sato, discussing how individual modeling components, as well as feature categories, contribute to its performance. |
Xiangyao Yu, Matt Youill, Matthew E Woicik, Abdurrahman Ghanem, Marco Serafini, Ashraf Aboulnaga, Michael Stonebraker PushdownDB: Accelerating a DBMS Using S3 Computation Inproceedings 36th IEEE International Conference on Data Engineering, ICDE 2020, Dallas, TX, USA, April 20-24, 2020, pp. 1802–1805, 2020. @inproceedings{DBLP:conf/icde/YuYWGSAS20, title = {PushdownDB: Accelerating a DBMS Using S3 Computation}, author = {Xiangyao Yu and Matt Youill and Matthew E Woicik and Abdurrahman Ghanem and Marco Serafini and Ashraf Aboulnaga and Michael Stonebraker}, url = {https://doi.org/10.1109/ICDE48307.2020.00174 https://marcoserafini.github.io/papers/pushdown.pdf}, doi = {10.1109/ICDE48307.2020.00174}, year = {2020}, date = {2020-01-01}, booktitle = {36th IEEE International Conference on Data Engineering, ICDE 2020, Dallas, TX, USA, April 20-24, 2020}, pages = {1802--1805}, crossref = {DBLP:conf/icde/2020}, abstract = {This paper studies the effectiveness of pushing parts of DBMS analytics queries into the Simple Storage Service (S3) of Amazon Web Services (AWS), using a recently released capability called S3 Select. We show that some DBMS primitives (filter, projection, and aggregation) can always be cost-effectively moved into S3. Other more complex operations (join, top-K, and groupby) require reimplementation to take advantage of S3 Select and are often candidates for pushdown. We demonstrate these capabilities through experimentation using a new DBMS that we developed, PushdownDB. Experimentation with a collection of queries including TPC-H queries shows that PushdownDB is on average 30% cheaper and 6.7× faster than a baseline that does not use S3 Select.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } This paper studies the effectiveness of pushing parts of DBMS analytics queries into the Simple Storage Service (S3) of Amazon Web Services (AWS), using a recently released capability called S3 Select. 
We show that some DBMS primitives (filter, projection, and aggregation) can always be cost-effectively moved into S3. Other more complex operations (join, top-K, and groupby) require reimplementation to take advantage of S3 Select and are often candidates for pushdown. We demonstrate these capabilities through experimentation using a new DBMS that we developed, PushdownDB. Experimentation with a collection of queries including TPC-H queries shows that PushdownDB is on average 30% cheaper and 6.7× faster than a baseline that does not use S3 Select. |
Xiaowei Zhu, Marco Serafini, Xiaosong Ma, Ashraf Aboulnaga, Wenguang Chen, Guanyu Feng LiveGraph: A Transactional Graph Storage System with Purely Sequential Adjacency List Scans Journal Article Proc. VLDB Endow., 13 (7), pp. 1020–1034, 2020. @article{DBLP:journals/pvldb/ZhuSMACF20, title = {LiveGraph: A Transactional Graph Storage System with Purely Sequential Adjacency List Scans}, author = {Xiaowei Zhu and Marco Serafini and Xiaosong Ma and Ashraf Aboulnaga and Wenguang Chen and Guanyu Feng}, url = {http://www.vldb.org/pvldb/vol13/p1020-zhu.pdf}, year = {2020}, date = {2020-01-01}, journal = {Proc. VLDB Endow.}, volume = {13}, number = {7}, pages = {1020--1034}, abstract = {The specific characteristics of graph workloads make it hard to design a one-size-fits-all graph storage system. Systems that support transactional updates use data structures with poor data locality, which limits the efficiency of analytical workloads or even simple edge scans. Other systems run graph analytics workloads efficiently, but cannot properly support transactions. This paper presents LiveGraph, a graph storage system that outperforms both the best graph transactional systems and the best solutions for real-time graph analytics on fresh data. LiveGraph achieves this by ensuring that adjacency list scans, a key operation in graph workloads, are purely sequential: they never require random accesses even in presence of concurrent transactions. Such pure-sequential operations are enabled by combining a novel graph-aware data structure, the Transactional Edge Log (TEL), with a concurrency control mechanism that leverages TEL’s data layout. Our evaluation shows that LiveGraph significantly outperforms state-of-the-art (graph) database solutions on both transactional and real-time analytical workloads.}, keywords = {}, pubstate = {published}, tppubtype = {article} } The specific characteristics of graph workloads make it hard to design a one-size-fits-all graph storage system. 
Systems that support transactional updates use data structures with poor data locality, which limits the efficiency of analytical workloads or even simple edge scans. Other systems run graph analytics workloads efficiently, but cannot properly support transactions. This paper presents LiveGraph, a graph storage system that outperforms both the best graph transactional systems and the best solutions for real-time graph analytics on fresh data. LiveGraph achieves this by ensuring that adjacency list scans, a key operation in graph workloads, are purely sequential: they never require random accesses even in presence of concurrent transactions. Such pure-sequential operations are enabled by combining a novel graph-aware data structure, the Transactional Edge Log (TEL), with a concurrency control mechanism that leverages TEL’s data layout. Our evaluation shows that LiveGraph significantly outperforms state-of-the-art (graph) database solutions on both transactional and real-time analytical workloads. |
Cibele Freire, Wolfgang Gatterbauer, Neil Immerman, Alexandra Meliou New Results for the Complexity of Resilience for Binary Conjunctive Queries with Self-Joins Inproceedings Proceedings of the 39th ACM SIGMOD-SIGACT-SIGAI Symposium on Principles of Database Systems, PODS 2020, Portland, OR, USA, June 14-19, 2020, pp. 271–284, 2020. @inproceedings{DBLP:conf/pods/FreireGIM20, title = {New Results for the Complexity of Resilience for Binary Conjunctive Queries with Self-Joins}, author = {Cibele Freire and Wolfgang Gatterbauer and Neil Immerman and Alexandra Meliou}, url = {https://doi.org/10.1145/3375395.3387647 https://people.cs.umass.edu/~ameli/projects/causality/papers/pods2020-Resilience.pdf}, doi = {10.1145/3375395.3387647}, year = {2020}, date = {2020-06-16}, booktitle = {Proceedings of the 39th ACM SIGMOD-SIGACT-SIGAI Symposium on Principles of Database Systems, PODS 2020, Portland, OR, USA, June 14-19, 2020}, pages = {271--284}, crossref = {DBLP:conf/pods/2020}, abstract = {The resilience of a Boolean query on a database is the minimum number of tuples that need to be deleted from the input tables in order to make the query false. A solution to this problem immediately translates into a solution for the more widely known problem of deletion propagation with source-side effects. In this paper, we give several novel results on the hardness of the resilience problem for conjunctive queries with self-joins, and, more specifically, we present a dichotomy result for the class of single-self-join binary queries with exactly two repeated relations occurring in the query. Unlike in the self-join free case, the concept of triad is not enough to fully characterize the complexity of resilience. We identify new structural properties, namely chains, confluences and permutations, which lead to various NP-hardness results. We also give novel involved reductions to network flow to show certain cases are in P. 
Although restricted, our results provide important insights into the problem of self-joins that we hope can help solve the general case of all conjunctive queries with self-joins in the future.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The resilience of a Boolean query on a database is the minimum number of tuples that need to be deleted from the input tables in order to make the query false. A solution to this problem immediately translates into a solution for the more widely known problem of deletion propagation with source-side effects. In this paper, we give several novel results on the hardness of the resilience problem for conjunctive queries with self-joins, and, more specifically, we present a dichotomy result for the class of single-self-join binary queries with exactly two repeated relations occurring in the query. Unlike in the self-join free case, the concept of triad is not enough to fully characterize the complexity of resilience. We identify new structural properties, namely chains, confluences and permutations, which lead to various NP-hardness results. We also give novel involved reductions to network flow to show certain cases are in P. Although restricted, our results provide important insights into the problem of self-joins that we hope can help solve the general case of all conjunctive queries with self-joins in the future. |
Anna Fariha, Ashish Tiwari, Arjun Radhakrishna, Sumit Gulwani ExTuNe: Explaining Tuple Non-conformance Inproceedings Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020, pp. 2741–2744, 2020. @inproceedings{DBLP:conf/sigmod/Fariha0RG20, title = {ExTuNe: Explaining Tuple Non-conformance}, author = {Anna Fariha and Ashish Tiwari and Arjun Radhakrishna and Sumit Gulwani}, url = {https://doi.org/10.1145/3318464.3384694 https://people.cs.umass.edu/~afariha/papers/ExTuNe.pdf}, doi = {10.1145/3318464.3384694}, year = {2020}, date = {2020-06-16}, booktitle = {Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020}, pages = {2741--2744}, crossref = {DBLP:conf/sigmod/2020}, abstract = {In data-driven systems, we often encounter tuples on which the predictions of a machine-learned model are untrustworthy. A key cause of such untrustworthiness is non-conformance of a new tuple with respect to the training dataset. To check conformance, we introduce a novel concept of data invariant, which captures a set of implicit constraints that all tuples of a dataset satisfy: a test tuple is non-conforming if it violates the data invariants. Data invariants model complex relationships among multiple attributes; but do not provide interpretable explanations of non-conformance. We present ExTuNe, a system for Explaining causes of Tuple Non-conformance. Based on the principles of causality, ExTuNe assigns responsibility to the attributes for causing non-conformance. The key idea is to observe change in invariant violation under intervention on attribute-values. Through a simple interface, ExTuNe produces a ranked list of the test tuples based on their degree of non-conformance and visualizes tuple-level attribute responsibility for non-conformance through heat maps. 
ExTuNe further visualizes attribute responsibility, aggregated over the test tuples. We demonstrate how ExTuNe can detect and explain tuple non-conformance and assist the users to make careful decisions towards achieving trusted machine learning.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } In data-driven systems, we often encounter tuples on which the predictions of a machine-learned model are untrustworthy. A key cause of such untrustworthiness is non-conformance of a new tuple with respect to the training dataset. To check conformance, we introduce a novel concept of data invariant, which captures a set of implicit constraints that all tuples of a dataset satisfy: a test tuple is non-conforming if it violates the data invariants. Data invariants model complex relationships among multiple attributes; but do not provide interpretable explanations of non-conformance. We present ExTuNe, a system for Explaining causes of Tuple Non-conformance. Based on the principles of causality, ExTuNe assigns responsibility to the attributes for causing non-conformance. The key idea is to observe change in invariant violation under intervention on attribute-values. Through a simple interface, ExTuNe produces a ranked list of the test tuples based on their degree of non-conformance and visualizes tuple-level attribute responsibility for non-conformance through heat maps. ExTuNe further visualizes attribute responsibility, aggregated over the test tuples. We demonstrate how ExTuNe can detect and explain tuple non-conformance and assist the users to make careful decisions towards achieving trusted machine learning. |
Anna Fariha, Suman Nath, Alexandra Meliou Causality-Guided Adaptive Interventional Debugging Inproceedings Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020, pp. 431–446, 2020. @inproceedings{DBLP:conf/sigmod/FarihaNM20, title = {Causality-Guided Adaptive Interventional Debugging}, author = {Anna Fariha and Suman Nath and Alexandra Meliou}, url = {https://doi.org/10.1145/3318464.3389694 https://people.cs.umass.edu/~afariha/papers/aid.pdf}, doi = {10.1145/3318464.3389694}, year = {2020}, date = {2020-06-15}, booktitle = {Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020}, pages = {431--446}, crossref = {DBLP:conf/sigmod/2020}, abstract = {Runtime nondeterminism is a fact of life in modern database applications. Previous research has shown that nondeterminism can cause applications to intermittently crash, become unresponsive, or experience data corruption. We propose Adaptive Interventional Debugging (AID) for debugging such intermittent failures. AID combines existing statistical debugging, causal analysis, fault injection, and group testing techniques in a novel way to (1) pinpoint the root cause of an application's intermittent failure and (2) generate an explanation of how the root cause triggers the failure. AID works by first identifying a set of runtime behaviors (called predicates) that are strongly correlated to the failure. It then utilizes temporal properties of the predicates to (over)-approximate their causal relationships. Finally, it uses fault injection to execute a sequence of interventions on the predicates and discover their true causal relationships. This enables AID to identify the true root cause and its causal relationship to the failure. We theoretically analyze how fast AID can converge to the identification. 
We evaluate AID with six real-world applications that intermittently fail under specific inputs. In each case, AID was able to identify the root cause and explain how the root cause triggered the failure, much faster than group testing and more precisely than statistical debugging. We also evaluate AID with many synthetically generated applications with known root causes and confirm that the benefits also hold for them.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Runtime nondeterminism is a fact of life in modern database applications. Previous research has shown that nondeterminism can cause applications to intermittently crash, become unresponsive, or experience data corruption. We propose Adaptive Interventional Debugging (AID) for debugging such intermittent failures. AID combines existing statistical debugging, causal analysis, fault injection, and group testing techniques in a novel way to (1) pinpoint the root cause of an application's intermittent failure and (2) generate an explanation of how the root cause triggers the failure. AID works by first identifying a set of runtime behaviors (called predicates) that are strongly correlated to the failure. It then utilizes temporal properties of the predicates to (over)-approximate their causal relationships. Finally, it uses fault injection to execute a sequence of interventions on the predicates and discover their true causal relationships. This enables AID to identify the true root cause and its causal relationship to the failure. We theoretically analyze how fast AID can converge to the identification. We evaluate AID with six real-world applications that intermittently fail under specific inputs. In each case, AID was able to identify the root cause and explain how the root cause triggered the failure, much faster than group testing and more precisely than statistical debugging. 
We also evaluate AID with many synthetically generated applications with known root causes and confirm that the benefits also hold for them. |
Matteo Brucato, Nishant Yadav, Azza Abouzied, Peter J Haas, Alexandra Meliou Stochastic Package Queries in Probabilistic Databases Inproceedings Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020, pp. 269–283, 2020. @inproceedings{DBLP:conf/sigmod/BrucatoYAHM20, title = {Stochastic Package Queries in Probabilistic Databases}, author = {Matteo Brucato and Nishant Yadav and Azza Abouzied and Peter J Haas and Alexandra Meliou}, url = {https://doi.org/10.1145/3318464.3389765 https://people.cs.umass.edu/~matteo/files/3318464.3389765.pdf}, doi = {10.1145/3318464.3389765}, year = {2020}, date = {2020-06-15}, booktitle = {Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020}, pages = {269--283}, crossref = {DBLP:conf/sigmod/2020}, abstract = {We provide methods for in-database support of decision making under uncertainty. Many important decision problems correspond to selecting a "package" (bag of tuples in a relational database) that jointly satisfy a set of constraints while minimizing some overall "cost" function; in most real-world problems, the data is uncertain. We provide methods for specifying---via a SQL extension---and processing stochastic package queries (SPQS), in order to solve optimization problems over uncertain data, right where the data resides. Prior work in stochastic programming uses Monte Carlo methods where the original stochastic optimization problem is approximated by a large deterministic optimization problem that incorporates many "scenarios", i.e., sample realizations of the uncertain data values. For large database tables, however, a huge number of scenarios is required, leading to poor performance and, often, failure of the solver software. 
We therefore provide a novel SummarySearch algorithm that, instead of trying to solve a large deterministic problem, seamlessly approximates it via a sequence of smaller problems defined over carefully crafted "summaries" of the scenarios that accelerate convergence to a feasible and near-optimal solution. Experimental results on our prototype system show that SummarySearch can be orders of magnitude faster than prior methods at finding feasible and high-quality packages.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } We provide methods for in-database support of decision making under uncertainty. Many important decision problems correspond to selecting a "package" (bag of tuples in a relational database) that jointly satisfy a set of constraints while minimizing some overall "cost" function; in most real-world problems, the data is uncertain. We provide methods for specifying---via a SQL extension---and processing stochastic package queries (SPQs), in order to solve optimization problems over uncertain data, right where the data resides. Prior work in stochastic programming uses Monte Carlo methods where the original stochastic optimization problem is approximated by a large deterministic optimization problem that incorporates many "scenarios", i.e., sample realizations of the uncertain data values. For large database tables, however, a huge number of scenarios is required, leading to poor performance and, often, failure of the solver software. We therefore provide a novel SummarySearch algorithm that, instead of trying to solve a large deterministic problem, seamlessly approximates it via a sequence of smaller problems defined over carefully crafted "summaries" of the scenarios that accelerate convergence to a feasible and near-optimal solution. Experimental results on our prototype system show that SummarySearch can be orders of magnitude faster than prior methods at finding feasible and high-quality packages.
David Pujol, Ryan McKenna, Satya Kuppam, Michael Hay, Ashwin Machanavajjhala, Gerome Miklau Fair decision making using privacy-protected data Inproceedings FAT* '20: Conference on Fairness, Accountability, and Transparency, Barcelona, Spain, January 27-30, 2020, pp. 189–199, 2020. @inproceedings{DBLP:conf/fat/PujolMKHMM20, title = {Fair decision making using privacy-protected data}, author = {David Pujol and Ryan McKenna and Satya Kuppam and Michael Hay and Ashwin Machanavajjhala and Gerome Miklau}, url = {https://doi.org/10.1145/3351095.3372872}, doi = {10.1145/3351095.3372872}, year = {2020}, date = {2020-01-01}, booktitle = {FAT* '20: Conference on Fairness, Accountability, and Transparency, Barcelona, Spain, January 27-30, 2020}, pages = {189--199}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Dan Zhang, Ryan McKenna, Ios Kotsogiannis, George Bissias, Michael Hay, Ashwin Machanavajjhala, Gerome Miklau ϵKTELO: A Framework for Defining Differentially Private Computations Journal Article ACM Trans. Database Syst., 45 (1), pp. 2:1–2:44, 2020. @article{DBLP:journals/tods/ZhangMKBHMM20, title = {ϵKTELO: A Framework for Defining Differentially Private Computations}, author = {Dan Zhang and Ryan McKenna and Ios Kotsogiannis and George Bissias and Michael Hay and Ashwin Machanavajjhala and Gerome Miklau}, url = {https://doi.org/10.1145/3362032}, doi = {10.1145/3362032}, year = {2020}, date = {2020-01-01}, journal = {ACM Trans. Database Syst.}, volume = {45}, number = {1}, pages = {2:1--2:44}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
2019 |
Matteo Brucato, Azza Abouzied, Alexandra Meliou Scalable computation of high-order optimization queries Journal Article Commun. ACM, 62 (2), pp. 108–116, 2019. @article{DBLP:journals/cacm/BrucatoAM19, title = {Scalable computation of high-order optimization queries}, author = {Matteo Brucato and Azza Abouzied and Alexandra Meliou}, url = {https://doi.org/10.1145/3299881 https://people.cs.umass.edu/~ameli/projects/packageBuilder/papers/p108-brucato.pdf}, doi = {10.1145/3299881}, year = {2019}, date = {2019-01-01}, journal = {Commun. ACM}, volume = {62}, number = {2}, pages = {108--116}, abstract = {Constrained optimization problems are at the heart of significant applications in a broad range of domains, including finance, transportation, manufacturing, and healthcare. Modeling and solving these problems has relied on application-specific solutions, which are often complex, error-prone, and do not generalize. Our goal is to create a domain-independent, declarative approach, supported and powered by the system where the data relevant to these problems typically resides: the database. We present a complete system that supports package queries, a new query model that extends traditional database queries to handle complex constraints and preferences over answer sets, allowing the declarative specification and efficient evaluation of a significant class of constrained optimization problems—integer linear programs (ILP)—within a database.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Constrained optimization problems are at the heart of significant applications in a broad range of domains, including finance, transportation, manufacturing, and healthcare. Modeling and solving these problems has relied on application-specific solutions, which are often complex, error-prone, and do not generalize. 
Our goal is to create a domain-independent, declarative approach, supported and powered by the system where the data relevant to these problems typically resides: the database. We present a complete system that supports package queries, a new query model that extends traditional database queries to handle complex constraints and preferences over answer sets, allowing the declarative specification and efficient evaluation of a significant class of constrained optimization problems—integer linear programs (ILP)—within a database. |
Junjay Tan, Thanaa Ghanem, Matthew Perron, Xiangyao Yu, Michael Stonebraker, David DeWitt, Marco Serafini, Ashraf Aboulnaga, Tim Kraska Choosing A Cloud DBMS: Architectures and Tradeoffs Journal Article Proceedings of the VLDB Endowment, 12 (12), 2019. @article{tan12choosing, title = {Choosing A Cloud DBMS: Architectures and Tradeoffs}, author = {Junjay Tan and Thanaa Ghanem and Matthew Perron and Xiangyao Yu and Michael Stonebraker and David DeWitt and Marco Serafini and Ashraf Aboulnaga and Tim Kraska}, url = {https://blog.acolyer.org/2019/08/30/choosing-a-cloud-dbms/ https://marcoserafini.github.io/papers/cloud-dbms.pdf}, year = {2019}, date = {2019-06-15}, journal = {Proceedings of the VLDB Endowment}, volume = {12}, number = {12}, abstract = {As analytic (OLAP) applications move to the cloud, DBMSs have shifted from employing a pure shared-nothing design with locally attached storage to a hybrid design that combines the use of shared-storage (e.g., AWS S3) with the use of shared-nothing query execution mechanisms. This paper sheds light on the resulting tradeoffs, which have not been properly identified in previous work. To this end, it evaluates the TPC-H benchmark across a variety of DBMS offerings running in a cloud environment (AWS) on fast 10Gb+ networks, specifically database-as-a-service offerings (Redshift, Athena), query engines (Presto, Hive), and a traditional cloud agnostic OLAP database (Vertica). While these comparisons cannot be apples-to-apples in all cases due to cloud configuration restrictions, we nonetheless identify patterns and design choices that are advantageous. 
These include prioritizing low-cost object stores like S3 for data storage, using system agnostic yet still performant columnar formats like ORC that allow easy switching to other systems for different workloads, and making features that benefit subsequent runs like query precompilation and caching remote data to faster storage optional rather than required because they disadvantage ad hoc queries.}, keywords = {}, pubstate = {published}, tppubtype = {article} } As analytic (OLAP) applications move to the cloud, DBMSs have shifted from employing a pure shared-nothing design with locally attached storage to a hybrid design that combines the use of shared-storage (e.g., AWS S3) with the use of shared-nothing query execution mechanisms. This paper sheds light on the resulting tradeoffs, which have not been properly identified in previous work. To this end, it evaluates the TPC-H benchmark across a variety of DBMS offerings running in a cloud environment (AWS) on fast 10Gb+ networks, specifically database-as-a-service offerings (Redshift, Athena), query engines (Presto, Hive), and a traditional cloud agnostic OLAP database (Vertica). While these comparisons cannot be apples-to-apples in all cases due to cloud configuration restrictions, we nonetheless identify patterns and design choices that are advantageous. These include prioritizing low-cost object stores like S3 for data storage, using system agnostic yet still performant columnar formats like ORC that allow easy switching to other systems for different workloads, and making features that benefit subsequent runs like query precompilation and caching remote data to faster storage optional rather than required because they disadvantage ad hoc queries. |
Anna Fariha, Alexandra Meliou Example-Driven Query Intent Discovery: Abductive Reasoning using Semantic Similarity Journal Article PVLDB, 12 (11), pp. 1262–1275, 2019. @article{DBLP:journals/pvldb/FarihaM19, title = {Example-Driven Query Intent Discovery: Abductive Reasoning using Semantic Similarity}, author = {Anna Fariha and Alexandra Meliou}, url = {http://www.vldb.org/pvldb/vol12/p1262-fariha.pdf squid.cs.umass.edu https://bitbucket.org/afariha/squid-public/}, year = {2019}, date = {2019-01-01}, journal = {PVLDB}, volume = {12}, number = {11}, pages = {1262--1275}, abstract = {Traditional relational data interfaces require precise structured queries over potentially complex schemas. These rigid data retrieval mechanisms pose hurdles for non-expert users, who typically lack language expertise and are unfamiliar with the details of the schema. Query by Example (QBE) methods offer an alternative mechanism: users provide examples of their intended query output and the QBE system needs to infer the intended query. However, these approaches focus on the structural similarity of the examples and ignore the richer context present in the data. As a result, they typically produce queries that are too general, and fail to capture the user’s intent effectively. In this paper, we present SQUID, a system that performs semantic similarity-aware query intent discovery. Our work makes the following contributions: (1) We design an end-to-end system that automatically formulates select-project-join queries in an open-world setting, with optional group-by aggregation and intersection operators; a much larger class than prior QBE techniques. (2) We express the problem of query intent discovery using a probabilistic abduction model, that infers a query as the most likely explanation of the provided examples. (3) We introduce the notion of an abduction-ready database, which precomputes semantic properties and related statistics, allowing SQUID to achieve real-time performance. 
(4) We present an extensive empirical evaluation on three real-world datasets, including user-intent case studies, demonstrating that SQUID is efficient and effective, and outperforms machine learning methods, as well as the state-of-the-art in the related query reverse engineering problem. }, keywords = {}, pubstate = {published}, tppubtype = {article} } Traditional relational data interfaces require precise structured queries over potentially complex schemas. These rigid data retrieval mechanisms pose hurdles for non-expert users, who typically lack language expertise and are unfamiliar with the details of the schema. Query by Example (QBE) methods offer an alternative mechanism: users provide examples of their intended query output and the QBE system needs to infer the intended query. However, these approaches focus on the structural similarity of the examples and ignore the richer context present in the data. As a result, they typically produce queries that are too general, and fail to capture the user’s intent effectively. In this paper, we present SQUID, a system that performs semantic similarity-aware query intent discovery. Our work makes the following contributions: (1) We design an end-to-end system that automatically formulates select-project-join queries in an open-world setting, with optional group-by aggregation and intersection operators; a much larger class than prior QBE techniques. (2) We express the problem of query intent discovery using a probabilistic abduction model, that infers a query as the most likely explanation of the provided examples. (3) We introduce the notion of an abduction-ready database, which precomputes semantic properties and related statistics, allowing SQUID to achieve real-time performance. 
(4) We present an extensive empirical evaluation on three real-world datasets, including user-intent case studies, demonstrating that SQUID is efficient and effective, and outperforms machine learning methods, as well as the state-of-the-art in the related query reverse engineering problem. |
Xiaolan Wang, Xin Luna Dong, Yang Li, Alexandra Meliou MIDAS: Finding the Right Web Sources to Fill Knowledge Gaps Inproceedings 35th IEEE International Conference on Data Engineering, ICDE 2019, Macao, China, April 8-11, 2019, pp. 578–589, 2019. @inproceedings{DBLP:conf/icde/WangDLM19, title = {MIDAS: Finding the Right Web Sources to Fill Knowledge Gaps}, author = {Xiaolan Wang and Xin Luna Dong and Yang Li and Alexandra Meliou}, url = {https://doi.org/10.1109/ICDE.2019.00058}, doi = {10.1109/ICDE.2019.00058}, year = {2019}, date = {2019-01-01}, booktitle = {35th IEEE International Conference on Data Engineering, ICDE 2019, Macao, China, April 8-11, 2019}, pages = {578--589}, abstract = {Knowledge bases, massive collections of facts (RDF triples) on diverse topics, support vital modern applications. However, existing knowledge bases contain very little data compared to the wealth of information on the Web. This is because the industry standard in knowledge base creation and augmentation suffers from a serious bottleneck: they rely on domain experts to identify appropriate web sources to extract data from. Efforts to fully automate knowledge extraction have failed to improve this standard: these automated systems are able to retrieve much more data and from a broader range of sources, but they suffer from very low precision and recall. As a result, these large-scale extractions remain unexploited. In this paper, we present MIDAS, a system that harnesses the results of automated knowledge extraction pipelines to repair the bottleneck in industrial knowledge creation and augmentation processes. MIDAS automates the suggestion of good-quality web sources and describes what to extract with respect to augmenting an existing knowledge base. We make three major contributions. First, we introduce a novel concept, web source slices, to describe the contents of a web source. 
Second, we define a profit function to quantify the value of a web source slice with respect to augmenting an existing knowledge base. Third, we develop effective and highly-scalable algorithms to derive high-profit web source slices. We demonstrate that MIDAS produces high-profit results and outperforms the baselines significantly on both real-world and synthetic datasets.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Knowledge bases, massive collections of facts (RDF triples) on diverse topics, support vital modern applications. However, existing knowledge bases contain very little data compared to the wealth of information on the Web. This is because the industry standard in knowledge base creation and augmentation suffers from a serious bottleneck: they rely on domain experts to identify appropriate web sources to extract data from. Efforts to fully automate knowledge extraction have failed to improve this standard: these automated systems are able to retrieve much more data and from a broader range of sources, but they suffer from very low precision and recall. As a result, these large-scale extractions remain unexploited. In this paper, we present MIDAS, a system that harnesses the results of automated knowledge extraction pipelines to repair the bottleneck in industrial knowledge creation and augmentation processes. MIDAS automates the suggestion of good-quality web sources and describes what to extract with respect to augmenting an existing knowledge base. We make three major contributions. First, we introduce a novel concept, web source slices, to describe the contents of a web source. Second, we define a profit function to quantify the value of a web source slice with respect to augmenting an existing knowledge base. Third, we develop effective and highly-scalable algorithms to derive high-profit web source slices. 
We demonstrate that MIDAS produces high-profit results and outperforms the baselines significantly on both real-world and synthetic datasets. |
Xiaolan Wang, Alexandra Meliou Explain3D: Explaining Disagreements in Disjoint Datasets Journal Article PVLDB, 12 (7), pp. 779–792, 2019. @article{DBLP:journals/pvldb/WangM19, title = {Explain3D: Explaining Disagreements in Disjoint Datasets}, author = {Xiaolan Wang and Alexandra Meliou}, url = {http://www.vldb.org/pvldb/vol12/p779-wang.pdf}, year = {2019}, date = {2019-01-01}, journal = {PVLDB}, volume = {12}, number = {7}, pages = {779--792}, abstract = {Data plays an important role in applications, analytic processes, and many aspects of human activity. As data grows in size and complexity, we are met with an imperative need for tools that promote understanding and explanations over data-related operations. Data management research on explanations has focused on the assumption that data resides in a single dataset, under one common schema. But the reality of today's data is that it is frequently unintegrated, coming from different sources with different schemas. When different datasets provide different answers to semantically similar questions, understanding the reasons for the discrepancies is challenging and cannot be handled by the existing single-dataset solutions. In this paper, we propose explain3D, a framework for explaining the disagreements across disjoint datasets (3D). Explain3D focuses on identifying the reasons for the differences in the results of two semantically similar queries operating on two datasets with potentially different schemas. Our framework leverages the queries to perform a semantic mapping across the relevant parts of their provenance; discrepancies in this mapping point to causes of the queries' differences. Exploiting the queries gives explain3D an edge over traditional schema matching and record linkage techniques, which are query-agnostic. 
Our work makes the following contributions: (1) We formalize the problem of deriving optimal explanations for the differences of the results of semantically similar queries over disjoint datasets. Our optimization problem considers two types of explanations, provenance-based and value-based, defined over an evidence mapping, which makes our solution interpretable. (2) We design a 3-stage framework for solving the optimal explanation problem. (3) We develop a smart-partitioning optimizer that improves the efficiency of the framework by orders of magnitude. (4) We experiment with real-world and synthetic data to demonstrate that explain3D can derive precise explanations efficiently, and is superior to alternative methods based on integration techniques and single-dataset explanation frameworks.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Data plays an important role in applications, analytic processes, and many aspects of human activity. As data grows in size and complexity, we are met with an imperative need for tools that promote understanding and explanations over data-related operations. Data management research on explanations has focused on the assumption that data resides in a single dataset, under one common schema. But the reality of today's data is that it is frequently unintegrated, coming from different sources with different schemas. When different datasets provide different answers to semantically similar questions, understanding the reasons for the discrepancies is challenging and cannot be handled by the existing single-dataset solutions. In this paper, we propose explain3D, a framework for explaining the disagreements across disjoint datasets (3D). Explain3D focuses on identifying the reasons for the differences in the results of two semantically similar queries operating on two datasets with potentially different schemas. 
Our framework leverages the queries to perform a semantic mapping across the relevant parts of their provenance; discrepancies in this mapping point to causes of the queries' differences. Exploiting the queries gives explain3D an edge over traditional schema matching and record linkage techniques, which are query-agnostic. Our work makes the following contributions: (1) We formalize the problem of deriving optimal explanations for the differences of the results of semantically similar queries over disjoint datasets. Our optimization problem considers two types of explanations, provenance-based and value-based, defined over an evidence mapping, which makes our solution interpretable. (2) We design a 3-stage framework for solving the optimal explanation problem. (3) We develop a smart-partitioning optimizer that improves the efficiency of the framework by orders of magnitude. (4) We experiment with real-world and synthetic data to demonstrate that explain3D can derive precise explanations efficiently, and is superior to alternative methods based on integration techniques and single-dataset explanation frameworks. |
Ryan McKenna, Daniel Sheldon, Gerome Miklau Graphical-model based estimation and inference for differential privacy Inproceedings International Conference on Machine Learning, pp. 4435–4444, 2019. @inproceedings{mckenna2019graphical, title = {Graphical-model based estimation and inference for differential privacy}, author = {Ryan McKenna and Daniel Sheldon and Gerome Miklau}, url = {http://proceedings.mlr.press/v97/mckenna19a/mckenna19a.pdf https://people.cs.umass.edu/~miklau/assets/pubs/dp/mckenna19pgm.pdf }, year = {2019}, date = {2019-01-01}, booktitle = {International Conference on Machine Learning}, pages = {4435--4444}, abstract = {Many privacy mechanisms reveal high-level information about a data distribution through noisy measurements. It is common to use this information to estimate the answers to new queries. In this work, we provide an approach to solve this estimation problem efficiently using graphical models, which is particularly effective when the distribution is high-dimensional but the measurements are over low-dimensional marginals. We show that our approach is far more efficient than existing estimation techniques from the privacy literature and that it can improve the accuracy and scalability of many state-of-the-art mechanisms.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Many privacy mechanisms reveal high-level information about a data distribution through noisy measurements. It is common to use this information to estimate the answers to new queries. In this work, we provide an approach to solve this estimation problem efficiently using graphical models, which is particularly effective when the distribution is high-dimensional but the measurements are over low-dimensional marginals. We show that our approach is far more efficient than existing estimation techniques from the privacy literature and that it can improve the accuracy and scalability of many state-of-the-art mechanisms. |
Ahmed Elgohary, Matthias Boehm, Peter J Haas, Frederick R Reiss, Berthold Reinwald Compressed linear algebra for declarative large-scale machine learning Journal Article Commun. ACM, 62 (5), pp. 83–91, 2019. @article{DBLP:journals/cacm/ElgoharyBHRR19, title = {Compressed linear algebra for declarative large-scale machine learning}, author = {Ahmed Elgohary and Matthias Boehm and Peter J Haas and Frederick R Reiss and Berthold Reinwald}, url = {https://doi.org/10.1145/3318221}, doi = {10.1145/3318221}, year = {2019}, date = {2019-01-01}, journal = {Commun. ACM}, volume = {62}, number = {5}, pages = {83--91}, abstract = {Large-scale Machine Learning (ML) algorithms are often iterative, using repeated read-only data access and I/O-bound matrix-vector multiplications. Hence, it is crucial for performance to fit the data into single-node or distributed main memory to enable fast matrix-vector operations. General-purpose compression struggles to achieve both good compression ratios and fast decompression for blockwise uncompressed operations. Therefore, we introduce Compressed Linear Algebra (CLA) for lossless matrix compression. CLA encodes matrices with lightweight, value-based compression techniques and executes linear algebra operations directly on the compressed representations. We contribute effective column compression schemes, cache-conscious operations, and an efficient sampling-based compression algorithm. Our experiments show good compression ratios and operations performance close to the uncompressed case, which enables fitting larger datasets into available memory. We thereby obtain significant end-to-end performance improvements}, keywords = {}, pubstate = {published}, tppubtype = {article} } Large-scale Machine Learning (ML) algorithms are often iterative, using repeated read-only data access and I/O-bound matrix-vector multiplications. 
Hence, it is crucial for performance to fit the data into single-node or distributed main memory to enable fast matrix-vector operations. General-purpose compression struggles to achieve both good compression ratios and fast decompression for blockwise uncompressed operations. Therefore, we introduce Compressed Linear Algebra (CLA) for lossless matrix compression. CLA encodes matrices with lightweight, value-based compression techniques and executes linear algebra operations directly on the compressed representations. We contribute effective column compression schemes, cache-conscious operations, and an efficient sampling-based compression algorithm. Our experiments show good compression ratios and operations performance close to the uncompressed case, which enables fitting larger datasets into available memory. We thereby obtain significant end-to-end performance improvements |
Yanlei Diao, Pawel Guzewicz, Ioana Manolescu, Mirjana Mazuran Spade: A Modular Framework for Analytical Exploration of RDF Graphs Journal Article PVLDB, 12 (12), pp. 1926–1929, 2019. @article{DBLP:journals/pvldb/DiaoGMM19, title = {Spade: A Modular Framework for Analytical Exploration of RDF Graphs}, author = {Yanlei Diao and Pawel Guzewicz and Ioana Manolescu and Mirjana Mazuran}, url = {http://www.vldb.org/pvldb/vol12/p1926-diao.pdf https://hal.inria.fr/hal-02152844/document}, doi = {10.14778/3352063.3352101}, year = {2019}, date = {2019-01-01}, journal = {PVLDB}, volume = {12}, number = {12}, pages = {1926--1929}, abstract = {RDF data is complex; exploring it is hard, and can be done through many different metaphors. We have developed and propose to demonstrate Spade, a tool helping users discover meaningful content of an RDF graph by showing them the results of aggregation (OLAP-style) queries automatically identified from the data. Spade chooses aggregates that are visually interesting, a property formally based on statistical properties of the aggregation query results. While well understood for relational data, such exploration raises multiple challenges for RDF: facts, dimensions and measures have to be identified (as opposed to known beforehand); as there are more candidate aggregates, assessing their interestingness can be very costly; finally, ontologies bring novel specific challenges but also novel opportunities, enabling ontology-driven exploration from an aggregate initially proposed by the system. Spade is a generic, extensible framework, which we instantiated with: (i) novel methods for enumerating candidate measures and dimensions in the vast space of possibilities provided by an RDF graph; (ii) a set of aggregate interestingness functions; (iii) ontology-based interactive exploration, and (iv) efficient early-stop techniques for estimating the interestingness of an aggregate query. 
The demonstration will comprise interactive scenarios on a variety of large, interesting RDF graphs.}, keywords = {}, pubstate = {published}, tppubtype = {article} } RDF data is complex; exploring it is hard, and can be done through many different metaphors. We have developed and propose to demonstrate Spade, a tool helping users discover meaningful content of an RDF graph by showing them the results of aggregation (OLAP-style) queries automatically identified from the data. Spade chooses aggregates that are visually interesting, a property formally based on statistical properties of the aggregation query results. While well understood for relational data, such exploration raises multiple challenges for RDF: facts, dimensions and measures have to be identified (as opposed to known beforehand); as there are more candidate aggregates, assessing their interestingness can be very costly; finally, ontologies bring novel specific challenges but also novel opportunities, enabling ontology-driven exploration from an aggregate initially proposed by the system. Spade is a generic, extensible framework, which we instantiated with: (i) novel methods for enumerating candidate measures and dimensions in the vast space of possibilities provided by an RDF graph; (ii) a set of aggregate interestingness functions; (iii) ontology-based interactive exploration, and (iv) efficient early-stop techniques for estimating the interestingness of an aggregate query. The demonstration will comprise interactive scenarios on a variety of large, interesting RDF graphs. |
Khaled Zaouk, Fei Song, Chenghao Lyu, Arnab Sinha, Yanlei Diao, Prashant J Shenoy UDAO: A Next-Generation Unified Data Analytics Optimizer Journal Article PVLDB, 12 (12), pp. 1934–1937, 2019. @article{DBLP:journals/pvldb/ZaoukSLSDS19, title = {UDAO: A Next-Generation Unified Data Analytics Optimizer}, author = {Khaled Zaouk and Fei Song and Chenghao Lyu and Arnab Sinha and Yanlei Diao and Prashant J Shenoy}, url = {http://www.vldb.org/pvldb/vol12/p1934-zaouk.pdf}, doi = {10.14778/3352063.3352103}, year = {2019}, date = {2019-01-01}, journal = {PVLDB}, volume = {12}, number = {12}, pages = {1934--1937}, abstract = {Big data analytics systems today still lack the ability to take user performance goals and budgetary constraints, collectively referred to as “objectives”, and automatically configure an analytic job to achieve the objectives. This paper presents UDAO, a unified data analytics optimizer that can automatically determine the parameters of the runtime system, collectively called a job configuration, for general dataflow programs based on user objectives. UDAO embodies key techniques including in-situ modeling, which learns a model for each user objective in the same computing environment as the job is run, and multi-objective optimization, which computes a Pareto optimal set of job configurations to reveal tradeoffs between different objectives. 
Using benchmarks developed based on industry needs, our demonstration will allow the user to explore (1) learned models to gain insights into how various parameters affect user objectives; (2) Pareto frontiers to understand interesting tradeoffs between different objectives and how a configuration recommended by the optimizer explores these tradeoffs; (3) end-to-end benefits that UDAO can provide over default configurations or those manually tuned by engineers.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Big data analytics systems today still lack the ability to take user performance goals and budgetary constraints, collectively referred to as “objectives”, and automatically configure an analytic job to achieve the objectives. This paper presents UDAO, a unified data analytics optimizer that can automatically determine the parameters of the runtime system, collectively called a job configuration, for general dataflow programs based on user objectives. UDAO embodies key techniques including in-situ modeling, which learns a model for each user objective in the same computing environment as the job is run, and multi-objective optimization, which computes a Pareto optimal set of job configurations to reveal tradeoffs between different objectives. Using benchmarks developed based on industry needs, our demonstration will allow the user to explore (1) learned models to gain insights into how various parameters affect user objectives; (2) Pareto frontiers to understand interesting tradeoffs between different objectives and how a configuration recommended by the optimizer explores these tradeoffs; (3) end-to-end benefits that UDAO can provide over default configurations or those manually tuned by engineers. |
Brian Hentschel, Peter J Haas, Yuanyuan Tian Online Model Management via Temporally Biased Sampling Journal Article SIGMOD Record, 48 (1), pp. 69–76, 2019. @article{DBLP:journals/sigmod/HentschelHT19, title = {Online Model Management via Temporally Biased Sampling}, author = {Brian Hentschel and Peter J Haas and Yuanyuan Tian}, url = {https://doi.org/10.1145/3371316.3371333 https://people.cs.umass.edu/~phaas/files/tbs-sigmod-record.pdf}, doi = {10.1145/3371316.3371333}, year = {2019}, date = {2019-01-01}, journal = {SIGMOD Record}, volume = {48}, number = {1}, pages = {69--76}, abstract = {To maintain the accuracy of supervised learning models in the presence of evolving data streams, we provide temporally-biased sampling schemes that weight recent data most heavily, with inclusion probabilities for a given data item decaying exponentially over time. We then periodically retrain the models on the current sample. We provide and analyze both a simple sampling scheme (T-TBS) that probabilistically maintains a target sample size and a novel reservoir-based scheme (R-TBS) that is the first to provide both control over the decay rate and a guaranteed upper bound on the sample size. The R-TBS and T-TBS schemes are of independent interest, extending the known set of unequal-probability sampling schemes. We discuss distributed implementation strategies; experiments in Spark show that our approach can increase machine learning accuracy and robustness in the face of evolving data.}, keywords = {}, pubstate = {published}, tppubtype = {article} } To maintain the accuracy of supervised learning models in the presence of evolving data streams, we provide temporally-biased sampling schemes that weight recent data most heavily, with inclusion probabilities for a given data item decaying exponentially over time. We then periodically retrain the models on the current sample. 
We provide and analyze both a simple sampling scheme (T-TBS) that probabilistically maintains a target sample size and a novel reservoir-based scheme (R-TBS) that is the first to provide both control over the decay rate and a guaranteed upper bound on the sample size. The R-TBS and T-TBS schemes are of independent interest, extending the known set of unequal-probability sampling schemes. We discuss distributed implementation strategies; experiments in Spark show that our approach can increase machine learning accuracy and robustness in the face of evolving data. |
Johanna Sommer, Matthias Boehm, Alexandre V Evfimievski, Berthold Reinwald, Peter J Haas MNC: Structure-Exploiting Sparsity Estimation for Matrix Expressions Inproceedings Proceedings of the 2019 International Conference on Management of Data, SIGMOD Conference 2019, Amsterdam, The Netherlands, June 30 - July 5, 2019, pp. 1607–1623, 2019. @inproceedings{DBLP:conf/sigmod/Sommer0ERH19, title = {MNC: Structure-Exploiting Sparsity Estimation for Matrix Expressions}, author = {Johanna Sommer and Matthias Boehm and Alexandre V Evfimievski and Berthold Reinwald and Peter J Haas}, url = {https://doi.org/10.1145/3299869.3319854 https://mboehm7.github.io/resources/sigmod2019.pdf}, doi = {10.1145/3299869.3319854}, year = {2019}, date = {2019-01-01}, booktitle = {Proceedings of the 2019 International Conference on Management of Data, SIGMOD Conference 2019, Amsterdam, The Netherlands, June 30 - July 5, 2019}, pages = {1607--1623}, crossref = {DBLP:conf/sigmod/2019}, abstract = {Efficiently computing linear algebra expressions is central to machine learning (ML) systems. Most systems support sparse formats and operations because sparse matrices are ubiquitous and their dense representation can cause prohibitive overheads. Estimating the sparsity of intermediates, however, remains a key challenge when generating execution plans or performing sparse operations. These sparsity estimates are used for cost and memory estimates, format decisions, and result allocation. Existing estimators tend to focus on matrix products only, and struggle to attain good accuracy with low estimation overhead. However, a key observation is that real-world sparse matrices commonly exhibit structural properties such as a single non-zero per row, or columns with varying sparsity. In this paper, we introduce MNC (Matrix Non-zero Count), a remarkably simple, count-based matrix synopsis that exploits these structural properties for efficient, accurate, and general sparsity estimation. 
We describe estimators and sketch propagation for realistic linear algebra expressions. Our experiments—on a new estimation benchmark called SparsEst—show that the MNC estimator yields good accuracy with very low overhead. This behavior makes MNC practical and broadly applicable in ML systems.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Efficiently computing linear algebra expressions is central to machine learning (ML) systems. Most systems support sparse formats and operations because sparse matrices are ubiquitous and their dense representation can cause prohibitive overheads. Estimating the sparsity of intermediates, however, remains a key challenge when generating execution plans or performing sparse operations. These sparsity estimates are used for cost and memory estimates, format decisions, and result allocation. Existing estimators tend to focus on matrix products only, and struggle to attain good accuracy with low estimation overhead. However, a key observation is that real-world sparse matrices commonly exhibit structural properties such as a single non-zero per row, or columns with varying sparsity. In this paper, we introduce MNC (Matrix Non-zero Count), a remarkably simple, count-based matrix synopsis that exploits these structural properties for efficient, accurate, and general sparsity estimation. We describe estimators and sketch propagation for realistic linear algebra expressions. Our experiments—on a new estimation benchmark called SparsEst—show that the MNC estimator yields good accuracy with very low overhead. This behavior makes MNC practical and broadly applicable in ML systems. |
Zhiqi Huang, Ryan McKenna, George Bissias, Gerome Miklau, Michael Hay, Ashwin Machanavajjhala PSynDB: Accurate and Accessible Private Data Generation Journal Article PVLDB, 12 (12), pp. 1918–1921, 2019. @article{DBLP:journals/pvldb/HuangMBMHM19, title = {PSynDB: Accurate and Accessible Private Data Generation}, author = {Zhiqi Huang and Ryan McKenna and George Bissias and Gerome Miklau and Michael Hay and Ashwin Machanavajjhala}, url = {http://www.vldb.org/pvldb/vol12/p1918-huang.pdf}, doi = {10.14778/3352063.3352099}, year = {2019}, date = {2019-01-01}, journal = {PVLDB}, volume = {12}, number = {12}, pages = {1918--1921}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Ios Kotsogiannis, Yuchao Tao, Xi He, Maryam Fanaeepour, Ashwin Machanavajjhala, Michael Hay, Gerome Miklau PrivateSQL: A Differentially Private SQL Query Engine Journal Article PVLDB, 12 (11), pp. 1371–1384, 2019. @article{DBLP:journals/pvldb/KotsogiannisTHF19, title = {PrivateSQL: A Differentially Private SQL Query Engine}, author = {Ios Kotsogiannis and Yuchao Tao and Xi He and Maryam Fanaeepour and Ashwin Machanavajjhala and Michael Hay and Gerome Miklau}, url = {http://www.vldb.org/pvldb/vol12/p1371-kotsogiannis.pdf}, doi = {10.14778/3342263.3342274}, year = {2019}, date = {2019-01-01}, journal = {PVLDB}, volume = {12}, number = {11}, pages = {1371--1384}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Dan Zhang, Ryan McKenna, Ios Kotsogiannis, George Bissias, Michael Hay, Ashwin Machanavajjhala, Gerome Miklau Ektelo: A Framework for Defining Differentially-Private Computations Journal Article SIGMOD Record, 48 (1), pp. 15–22, 2019. @article{DBLP:journals/sigmod/ZhangMKBHMM19, title = {Ektelo: A Framework for Defining Differentially-Private Computations}, author = {Dan Zhang and Ryan McKenna and Ios Kotsogiannis and George Bissias and Michael Hay and Ashwin Machanavajjhala and Gerome Miklau}, url = {https://doi.org/10.1145/3371316.3371321}, doi = {10.1145/3371316.3371321}, year = {2019}, date = {2019-01-01}, journal = {SIGMOD Record}, volume = {48}, number = {1}, pages = {15--22}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Brian Hentschel, Peter J Haas, Yuanyuan Tian General Temporally Biased Sampling Schemes for Online Model Management Journal Article ACM Trans. Database Syst., 44 (4), pp. 14:1–14:45, 2019. @article{DBLP:journals/tods/HentschelHT19, title = {General Temporally Biased Sampling Schemes for Online Model Management}, author = {Brian Hentschel and Peter J Haas and Yuanyuan Tian}, url = {https://doi.org/10.1145/3360903}, doi = {10.1145/3360903}, year = {2019}, date = {2019-12-24}, journal = {ACM Trans. Database Syst.}, volume = {44}, number = {4}, pages = {14:1--14:45}, abstract = {To maintain the accuracy of supervised learning models in the presence of evolving data streams, we provide temporally biased sampling schemes that weight recent data most heavily, with inclusion probabilities for a given data item decaying over time according to a specified “decay function.” We then periodically retrain the models on the current sample. This approach speeds up the training process relative to training on all of the data. Moreover, time-biasing lets the models adapt to recent changes in the data while—unlike in a sliding-window approach—still keeping some old data to ensure robustness in the face of temporary fluctuations and periodicities in the data values. In addition, the sampling-based approach allows existing analytic algorithms for static data to be applied to dynamic streaming data essentially without change. We provide and analyze both a simple sampling scheme (Targeted-Size Time-Biased Sampling (T-TBS)) that probabilistically maintains a target sample size and a novel reservoir-based scheme (Reservoir-Based Time-Biased Sampling (R-TBS)) that is the first to provide both control over the decay rate and a guaranteed upper bound on the sample size. If the decay function is exponential, then control over the decay rate is complete, and R-TBS maximizes both expected sample size and sample-size stability. 
For general decay functions, the actual item inclusion probabilities can be made arbitrarily close to the nominal probabilities, and we provide a scheme that allows a tradeoff between sample footprint and sample-size stability. R-TBS rests on the notion of a “fractional sample” and allows for data arrival rates that are unknown and time varying (unlike T-TBS). The R-TBS and T-TBS schemes are of independent interest, extending the known set of unequal-probability sampling schemes. We discuss distributed implementation strategies; experiments in Spark illuminate the performance and scalability of the algorithms, and show that our approach can increase machine learning robustness in the face of evolving data.}, keywords = {}, pubstate = {published}, tppubtype = {article} } To maintain the accuracy of supervised learning models in the presence of evolving data streams, we provide temporally biased sampling schemes that weight recent data most heavily, with inclusion probabilities for a given data item decaying over time according to a specified “decay function.” We then periodically retrain the models on the current sample. This approach speeds up the training process relative to training on all of the data. Moreover, time-biasing lets the models adapt to recent changes in the data while—unlike in a sliding-window approach—still keeping some old data to ensure robustness in the face of temporary fluctuations and periodicities in the data values. In addition, the sampling-based approach allows existing analytic algorithms for static data to be applied to dynamic streaming data essentially without change. We provide and analyze both a simple sampling scheme (Targeted-Size Time-Biased Sampling (T-TBS)) that probabilistically maintains a target sample size and a novel reservoir-based scheme (Reservoir-Based Time-Biased Sampling (R-TBS)) that is the first to provide both control over the decay rate and a guaranteed upper bound on the sample size. 
If the decay function is exponential, then control over the decay rate is complete, and R-TBS maximizes both expected sample size and sample-size stability. For general decay functions, the actual item inclusion probabilities can be made arbitrarily close to the nominal probabilities, and we provide a scheme that allows a tradeoff between sample footprint and sample-size stability. R-TBS rests on the notion of a “fractional sample” and allows for data arrival rates that are unknown and time varying (unlike T-TBS). The R-TBS and T-TBS schemes are of independent interest, extending the known set of unequal-probability sampling schemes. We discuss distributed implementation strategies; experiments in Spark illuminate the performance and scalability of the algorithms, and show that our approach can increase machine learning robustness in the face of evolving data. |
Ios Kotsogiannis, Yuchao Tao, Ashwin Machanavajjhala, Gerome Miklau, Michael Hay Architecting a Differentially Private SQL Engine Inproceedings CIDR 2019, 9th Biennial Conference on Innovative Data Systems Research, Asilomar, CA, USA, January 13-16, 2019, Online Proceedings, 2019. @inproceedings{DBLP:conf/cidr/KotsogiannisTMM19, title = {Architecting a Differentially Private SQL Engine}, author = {Ios Kotsogiannis and Yuchao Tao and Ashwin Machanavajjhala and Gerome Miklau and Michael Hay}, url = {http://cidrdb.org/cidr2019/papers/p125-kotsogiannis-cidr19.pdf}, year = {2019}, date = {2019-01-01}, booktitle = {CIDR 2019, 9th Biennial Conference on Innovative Data Systems Research, Asilomar, CA, USA, January 13-16, 2019, Online Proceedings}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
2023 |
GraphMini: Accelerating Graph Pattern Matching Using Auxiliary Graphs Journal Article International Conference on Parallel Architectures and Compilation Techniques, 2023. |
Piloting an Interactive Ethics and Responsible Computing Learning Environment in Undergraduate CS Courses Journal Article Proceedings of the 54th ACM Technical Symposium on Computer Science Education, 2023. |
NIM: Generative Neural Networks for Automated Modeling and Generation of Simulation Inputs Journal Article ACM Transactions on Modeling and Computer Simulation, 33 (3), pp. 1–26, 2023. |
Infectious Disease Modelling, 8 (1), pp. 84-100, 2023. |
Exact PPS Sampling with Bounded Sample Size Journal Article Information Processing Letters, 182 , 2023. |
2022 |
AIM: an adaptive and iterative mechanism for differentially private synthetic data Inproceedings Proceedings of the VLDB Endowment, pp. 2599–2612, 2022. |
Transitioning from testbeds to ships: an experience study in deploying the TIPPERS Internet of Things platform to the US Navy Journal Article The Journal of Defense Modeling and Simulation, 19 (3), pp. 501-517, 2022. |
DataPrism: Exposing Disconnect between Data and Systems Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2022, pp. 217-231, 2022. |
Through the data management lens: Experimental analysis and evaluation of fair classification Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2022, pp. 232-246, 2022. |
Improved approximation and scalability for fair max-min diversification Inproceedings ICDT 2022, 2022. |
Enhanced Simulation Metamodeling via Graph and Generative Neural Networks Inproceedings Winter Simulation Conference, WSC 2022, Singapore, December 11-14, 2022, pp. 2748-2759, 2022. |
Augmenting Decision Making via Interactive What-If Analysis Inproceedings 12th Conference on Innovative Data Systems Research, CIDR 2022, Chaminade, CA, USA, January 9-12, 2022, 2022. |
In-Database Decision Support: Opportunities and Challenges Journal Article IEEE Data Engineering Bulletin, 45 (3), pp. 102-115, 2022. |
2021 |
Winning the NIST Contest: A scalable and general approach to differentially private synthetic data Journal Article Journal of Privacy and Confidentiality, 11 (3), 2021. |
Relaxed marginal consistency for differentially private query answering Inproceedings Advances in Neural Information Processing Systems, pp. 20696-20707, 2021. |
FlexPushdownDB: Hybrid Pushdown and Caching in a Cloud DBMS Inproceedings Proceedings of the VLDB Endowment, pp. 2101–2113, 2021. |
Accelerating graph sampling for graph machine learning using GPUs Inproceedings EuroSys '21: Proceedings of the Sixteenth European Conference on Computer Systems, pp. 311–326, 2021. |
Scalable graph neural network training: The case for sampling Journal Article ACM SIGOPS Operating Systems Review, 55 (1), pp. 68-76, 2021. |
Spark-based Cloud Data Analytics using Multi-Objective Optimization Inproceedings 37th IEEE International Conference on Data Engineering, ICDE 2021, Chania, Greece, April 19-22, 2021, pp. 396–407, IEEE, 2021. |
Diverse Data Selection under Fairness Constraints Inproceedings International Conference on Database Theory, (ICDT), pp. 11:1–11:25, 2021. |
Conformance Constraint Discovery: Measuring Trust in Data-Driven Systems Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2021. |
CoCo: Interactive Exploration of Conformance Constraints for Data Understanding and Data Cleaning Inproceedings Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), 2021. |
2020 |
Do the Best Cloud Configurations Grow on Trees? An Experimental Evaluation of Black Box Algorithms for Optimizing Cloud Workloads Journal Article Proc. VLDB Endow., 13 (11), pp. 2563–2575, 2020. |
sPaQLTooLs: A Stochastic Package Query Interface for Scalable Constrained Optimization Journal Article Proc. VLDB Endow., 13 (12), pp. 2881–2884, 2020. |
SuDocu: Summarizing Documents by Example Journal Article Proc. VLDB Endow., 13 (12), pp. 2861–2864, 2020. |
Sato: Contextual Semantic Type Detection in Tables Journal Article Proc. VLDB Endow., 13 (11), pp. 1835–1848, 2020. |
PushdownDB: Accelerating a DBMS Using S3 Computation Inproceedings 36th IEEE International Conference on Data Engineering, ICDE 2020, Dallas, TX, USA, April 20-24, 2020, pp. 1802–1805, 2020. |
LiveGraph: A Transactional Graph Storage System with Purely Sequential Adjacency List Scans Journal Article Proc. VLDB Endow., 13 (7), pp. 1020–1034, 2020. |
New Results for the Complexity of Resilience for Binary Conjunctive Queries with Self-Joins Inproceedings Proceedings of the 39th ACM SIGMOD-SIGACT-SIGAI Symposium on Principles of Database Systems, PODS 2020, Portland, OR, USA, June 14-19, 2020, pp. 271–284, 2020. |
ExTuNe: Explaining Tuple Non-conformance Inproceedings Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020, pp. 2741–2744, 2020. |
Causality-Guided Adaptive Interventional Debugging Inproceedings Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020, pp. 431–446, 2020. |
Stochastic Package Queries in Probabilistic Databases Inproceedings Proceedings of the 2020 International Conference on Management of Data, SIGMOD Conference 2020, online conference [Portland, OR, USA], June 14-19, 2020, pp. 269–283, 2020. |
Fair decision making using privacy-protected data Inproceedings FAT* '20: Conference on Fairness, Accountability, and Transparency, Barcelona, Spain, January 27-30, 2020, pp. 189–199, 2020. |
ϵKTELO: A Framework for Defining Differentially Private Computations Journal Article ACM Trans. Database Syst., 45 (1), pp. 2:1–2:44, 2020. |
2019 |
Scalable computation of high-order optimization queries Journal Article Commun. ACM, 62 (2), pp. 108–116, 2019. |
Choosing A Cloud DBMS: Architectures and Tradeoffs Journal Article Proceedings of the VLDB Endowment, 12 (12), 2019. |
Example-Driven Query Intent Discovery: Abductive Reasoning using Semantic Similarity Journal Article PVLDB, 12 (11), pp. 1262–1275, 2019. |
MIDAS: Finding the Right Web Sources to Fill Knowledge Gaps Inproceedings 35th IEEE International Conference on Data Engineering, ICDE 2019, Macao, China, April 8-11, 2019, pp. 578–589, 2019. |
Explain3D: Explaining Disagreements in Disjoint Datasets Journal Article PVLDB, 12 (7), pp. 779–792, 2019. |
Graphical-model based estimation and inference for differential privacy Inproceedings International Conference on Machine Learning, pp. 4435–4444, 2019. |
Compressed linear algebra for declarative large-scale machine learning Journal Article Commun. ACM, 62 (5), pp. 83–91, 2019. |
Spade: A Modular Framework for Analytical Exploration of RDF Graphs Journal Article PVLDB, 12 (12), pp. 1926–1929, 2019. |
UDAO: A Next-Generation Unified Data Analytics Optimizer Journal Article PVLDB, 12 (12), pp. 1934–1937, 2019. |
Online Model Management via Temporally Biased Sampling Journal Article SIGMOD Record, 48 (1), pp. 69–76, 2019. |
MNC: Structure-Exploiting Sparsity Estimation for Matrix Expressions Inproceedings Proceedings of the 2019 International Conference on Management of Data, SIGMOD Conference 2019, Amsterdam, The Netherlands, June 30 - July 5, 2019, pp. 1607–1623, 2019. |
PSynDB: Accurate and Accessible Private Data Generation Journal Article PVLDB, 12 (12), pp. 1918–1921, 2019. |
PrivateSQL: A Differentially Private SQL Query Engine Journal Article PVLDB, 12 (11), pp. 1371–1384, 2019. |
Ektelo: A Framework for Defining Differentially-Private Computations Journal Article SIGMOD Record, 48 (1), pp. 15–22, 2019. |
General Temporally Biased Sampling Schemes for Online Model Management Journal Article ACM Trans. Database Syst., 44 (4), pp. 14:1–14:45, 2019. |
Architecting a Differentially Private SQL Engine Inproceedings CIDR 2019, 9th Biennial Conference on Innovative Data Systems Research, Asilomar, CA, USA, January 13-16, 2019, Online Proceedings, 2019. |