2017
Yan Chen, Ashwin Machanavajjhala, Michael Hay, Gerome Miklau: PeGaSus: Data-Adaptive Differentially Private Stream Processing. Inproceedings: Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security (CCS 2017), Dallas, TX, USA, pp. 1375–1388, 2017. DOI: 10.1145/3133956.3134102
Chao Li, Daniel Yang Li, Gerome Miklau, Dan Suciu: A theory of pricing private data. Journal Article: Communications of the ACM, 60(12), pp. 79–86, 2017. DOI: 10.1145/3139457
Matteo Brucato, Azza Abouzied, Chris Blauvelt: Redistributing Funds across Charitable Crowdfunding Campaigns. Journal Article: CoRR, abs/1706.00070, 2017. URL: http://arxiv.org/abs/1706.00070

On Kickstarter only 36% of crowdfunding campaigns successfully raise sufficient funds for their projects. In this paper, we explore the possibility of redistribution of crowdfunding donations to increase the chances of success. We define several intuitive redistribution policies and, using data from a real crowdfunding platform, LaunchGood, we assess the potential improvement in campaign fundraising success rates. We find that an aggressive redistribution scheme can boost campaign success rates from 37% to 79%, but such choice-agnostic redistribution schemes come at the cost of disregarding donor preferences. Taking inspiration from offline giving societies and donor clubs, we build a case for choice-preserving redistribution schemes that strike a balance between increasing the number of successful campaigns and respecting giving preference. We find that choice-preserving redistribution can easily achieve campaign success rates of 48%. Finally, we discuss the implications of these different redistribution schemes for the various stakeholders in the crowdfunding ecosystem.
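A minimal sketch of what a redistribution policy could look like, under an assumed data layout (each campaign as a dict with 'raised' and 'goal'); the greedy "fund the smallest gaps first" rule is an illustrative stand-in, not one of the paper's evaluated schemes:

```python
def redistribute(campaigns):
    """Withdraw surplus from over-funded campaigns, then greedily fund
    the campaigns with the smallest remaining gap."""
    surplus = 0
    for c in campaigns:
        extra = max(c["raised"] - c["goal"], 0)
        c["raised"] -= extra            # surplus leaves the over-funded campaign
        surplus += extra
    for c in sorted(campaigns, key=lambda c: c["goal"] - c["raised"]):
        gap = c["goal"] - c["raised"]
        if 0 < gap <= surplus:          # fully fund a campaign when possible
            c["raised"] += gap
            surplus -= gap
    return campaigns

demo = [{"raised": 150, "goal": 100},
        {"raised": 80, "goal": 100},
        {"raised": 10, "goal": 100}]
print(redistribute(demo))   # one success out of three becomes two out of three
```

A choice-preserving variant would additionally restrict each transfer to campaigns the donor had marked as acceptable alternatives.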
Ahmed Elgohary, Matthias Boehm, Peter J. Haas, Frederick R. Reiss, Berthold Reinwald: Compressed linear algebra for large-scale machine learning. Journal Article: The VLDB Journal, 2017, ISSN: 0949-877X. DOI: 10.1007/s00778-017-0478-1

Large-scale machine learning algorithms are often iterative, using repeated read-only data access and I/O-bound matrix-vector multiplications to converge to an optimal model. It is crucial for performance to fit the data into single-node or distributed main memory and enable fast matrix-vector operations on in-memory data. General-purpose, heavy- and lightweight compression techniques struggle to achieve both good compression ratios and fast decompression speed to enable block-wise uncompressed operations. Therefore, we initiate work---inspired by database compression and sparse matrix formats---on value-based compressed linear algebra (CLA), in which heterogeneous, lightweight database compression techniques are applied to matrices, and then linear algebra operations such as matrix-vector multiplication are executed directly on the compressed representation. We contribute effective column compression schemes, cache-conscious operations, and an efficient sampling-based compression algorithm. Our experiments show that CLA achieves in-memory operations performance close to the uncompressed case and good compression ratios, which enables fitting substantially larger datasets into available memory. We thereby obtain significant end-to-end performance improvements up to 9.2x.
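The core idea of operating directly on compressed data can be illustrated with a small sketch: run-length encode each matrix column and compute a matrix-vector product one run at a time, so each run costs one multiply instead of one multiply per row. This is a toy stand-in for CLA's heterogeneous column encodings and cache-conscious operators, not the actual implementation.

```python
import numpy as np

def rle_compress(col):
    """Run-length encode one column as (value, start, length) runs."""
    runs, i = [], 0
    while i < len(col):
        j = i
        while j < len(col) and col[j] == col[i]:
            j += 1
        runs.append((col[i], i, j - i))
        i = j
    return runs

def rle_matvec(columns, v, n_rows):
    """y = M @ v evaluated directly on the per-column RLE runs."""
    y = np.zeros(n_rows)
    for j, runs in enumerate(columns):
        for value, start, length in runs:
            y[start:start + length] += value * v[j]   # one multiply per run
    return y

M = np.array([[1, 0], [1, 0], [1, 3], [2, 3]], dtype=float)
cols = [rle_compress(M[:, j]) for j in range(M.shape[1])]
assert np.allclose(rle_matvec(cols, np.array([2.0, 1.0]), 4), M @ [2.0, 1.0])
```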
Abhishek Roy, Yanlei Diao, Uday Evani, Avinash Abhyankar, Clinton Howarth, Rémi Le Priol, Toby Bloom: Massively Parallel Processing of Whole Genome Sequence Data: An In-Depth Performance Study. Inproceedings: Proceedings of the 2017 ACM International Conference on Management of Data (SIGMOD 2017), Chicago, IL, USA, pp. 187–202, 2017. DOI: 10.1145/3035918.3064048

This paper presents a joint effort between a group of computer scientists and bioinformaticians to take an important step towards a general big data platform for genome analysis pipelines. The key goals of this study are to develop a thorough understanding of the strengths and limitations of big data technology for genomic data analysis, and to identify the key questions that the research community could address to realize the vision of personalized genomic medicine. Our platform, called Gesall, is based on the new "Wrapper Technology" that supports existing genomic data analysis programs in their native forms, without having to rewrite them. To do so, our system provides several layers of software, including a new Genome Data Parallel Toolkit (GDPT), which can be used to "wrap" existing data analysis programs. This platform offers a concrete context for evaluating big data technology for genomics: we report on super-linear speedup and sublinear speedup for various tasks, as well as the reasons why a parallel program could produce different results from those of a serial program. These results lead to key research questions that require a synergy between genomics scientists and computer scientists to find solutions.
Garrett Bernstein, Ryan McKenna, Tao Sun, Daniel Sheldon, Michael Hay, Gerome Miklau: Differentially Private Learning of Undirected Graphical Models using Collective Graphical Models. Journal Article: CoRR, abs/1706.04646, 2017. URL: http://arxiv.org/abs/1706.04646
Julia Stoyanovich, Bill Howe, Serge Abiteboul, Gerome Miklau, Arnaud Sahuguet, Gerhard Weikum: Fides: Towards a Platform for Responsible Data Science. Inproceedings: Proceedings of the 29th International Conference on Scientific and Statistical Database Management (SSDBM 2017), Chicago, IL, USA, pp. 26:1–26:6, 2017. DOI: 10.1145/3085504.3085530

Issues of responsible data analysis and use are coming to the forefront of the discourse in data science research and practice, with most significant efforts to date on the part of the data mining, machine learning, and security and privacy communities. In these fields, the research has been focused on analyzing the fairness, accountability and transparency (FAT) properties of specific algorithms and their outputs. Although these issues are most apparent in the social sciences where fairness is interpreted in terms of the distribution of resources across protected groups, management of bias in source data affects a variety of fields. Consider climate change studies that require representative data from geographically diverse regions, or supply chain analyses that require data that represents the diversity of products and customers. Any domain that involves sparse or sampled data has exposure to potential bias. In this vision paper, we argue that FAT properties must be considered as database system issues, further upstream in the data science lifecycle: bias in source data goes unnoticed, and bias may be introduced during pre-processing (fairness), spurious correlations lead to reproducibility problems (accountability), and assumptions made during pre-processing have invisible but significant effects on decisions (transparency). As machine learning methods continue to be applied broadly by non-experts, the potential for misuse increases. We see a need for a data sharing and collaborative analytics platform with features to encourage (and in some cases, enforce) best practices at all stages of the data science lifecycle. We describe features of such a platform, which we term Fides, in the context of urban analytics, outlining a systems research agenda in responsible data science.
Ios Kotsogiannis, Michael Hay, Ashwin Machanavajjhala, Gerome Miklau, Margaret Orr: DIAS: Differentially Private Interactive Algorithm Selection using Pythia. Inproceedings: Proceedings of the 2017 ACM International Conference on Management of Data (SIGMOD 2017), Chicago, IL, USA, pp. 1679–1682, 2017. DOI: 10.1145/3035918.3056441

Differential privacy has emerged as the dominant privacy standard for data analysis. Its wide acceptance has led to significant development of algorithms that meet this rigorous standard. For some tasks, such as the task of answering low dimensional counting queries, dozens of algorithms have been proposed. However, no single algorithm has emerged as the dominant performer, and in fact, algorithm performance varies drastically across inputs. Thus, it's not clear how to select an algorithm for a particular task, and choosing the wrong algorithm might lead to significant degradation in terms of analysis accuracy. We believe that the difficulty of algorithm selection is one factor limiting the adoption of differential privacy in real systems. In this demonstration we present DIAS (Differentially-private Interactive Algorithm Selection), an educational privacy game. Users are asked to perform algorithm selection for a variety of inputs and compare the performance of their choices against that of Pythia, an automated algorithm selection framework. Our hope is that by the end of the game users will understand the importance of algorithm selection and most importantly will have a good grasp on how to use differentially private algorithms for their own applications.
Ios Kotsogiannis, Ashwin Machanavajjhala, Michael Hay, Gerome Miklau: Pythia: Data Dependent Differentially Private Algorithm Selection. Inproceedings: Proceedings of the 2017 ACM International Conference on Management of Data (SIGMOD 2017), Chicago, IL, USA, pp. 1323–1337, 2017. DOI: 10.1145/3035918.3035945

Differential privacy has emerged as a preferred standard for ensuring privacy in analysis tasks on sensitive datasets. Recent algorithms have allowed for significantly lower error by adapting to properties of the input data. These so-called data-dependent algorithms have different error rates for different inputs. There is now a complex and growing landscape of algorithms without a clear winner that can offer low error over all datasets. As a result, the best possible error rates are not attainable in practice, because the data curator cannot know which algorithm to select prior to actually running the algorithm. We address this challenge by proposing a novel meta-algorithm designed to relieve the data curator of the burden of algorithm selection. It works by learning (from non-sensitive data) the association between dataset properties and the best-performing algorithm. The meta-algorithm is deployed by first testing the input for low-sensitivity properties and then using the results to select a good algorithm. The result is an end-to-end differentially private system: Pythia, which we show offers improvements over using any single algorithm alone. We empirically demonstrate the benefit of Pythia for the tasks of releasing histograms, answering 1- and 2-dimensional range queries, as well as for constructing private Naive Bayes classifiers.
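The abstract's recipe, spend a little privacy budget measuring low-sensitivity dataset properties and then let a selector trained offline on non-sensitive data pick the algorithm, can be sketched as follows. The two features and the stand-in selector are assumptions; the paper's actual feature set and learning procedure are richer.

```python
import numpy as np

def select_algorithm(hist, selector, eps_feature, rng=np.random.default_rng(0)):
    """Measure sensitivity-1 properties of a histogram with Laplace noise,
    then delegate the choice of DP algorithm to an offline-trained selector."""
    scale = 1.0 / eps_feature                # Laplace scale for sensitivity 1
    noisy_total = hist.sum() + rng.laplace(0, scale)
    noisy_frac_nonzero = ((hist > 0).sum() + rng.laplace(0, scale)) / len(hist)
    return selector([noisy_total, noisy_frac_nonzero])

# Stand-in for a selector learned from public datasets: sparse inputs tend to
# favor data-dependent algorithms, dense ones the plain Laplace mechanism.
selector = lambda f: "data-dependent" if f[1] < 0.3 else "laplace"
print(select_algorithm(np.array([5, 0, 0, 0, 9, 0, 0, 0, 0, 0]), selector, 1.0))
```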
Michael Hay, Liudmila Elagina, Gerome Miklau: Differentially Private Rank Aggregation. Inproceedings: Proceedings of the 2017 SIAM International Conference on Data Mining (SDM 2017), Houston, Texas, USA, pp. 669–677, 2017. DOI: 10.1137/1.9781611974973.75

Given a collection of rankings of a set of items, rank aggregation seeks to compute a ranking that can serve as a single best representative of the collection. Rank aggregation is a well-studied problem and a number of effective algorithmic solutions have been proposed in the literature. However, when individuals are asked to contribute a ranking, they may be concerned that their personal preferences will be disclosed inappropriately to others. This acts as a disincentive to individuals to respond honestly in expressing their preferences and impedes data collection and data sharing. We address this problem by investigating rank aggregation under differential privacy, which requires that a released output (here, the aggregate ranking computed from individuals' rankings) remain almost the same if any one individual's ranking is removed from the input. We propose a number of differentially-private rank aggregation algorithms: two are inspired by non-private approximate rank aggregators from the existing literature; another uses a novel rejection sampling method to sample privately from a complex distribution. For all the methods we propose, we quantify, both theoretically and empirically, the "cost" of privacy in terms of the quality of the rank aggregation computed.
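As a concrete instance of the problem setup, here is a hedged sketch that privatizes Borda-style rank aggregation with the Laplace mechanism. Borda scoring is a deliberately simple stand-in; the paper's algorithms (approximate aggregators and a rejection-sampling method) are more sophisticated.

```python
import numpy as np

def private_borda(rankings, epsilon, rng=np.random.default_rng(0)):
    """Each ranking is a permutation of item ids, best first. One ranking
    contributes Borda points summing to n(n-1)/2, so that is the L1
    sensitivity of the score vector under removal of one individual."""
    n = len(rankings[0])
    scores = np.zeros(n)
    for r in rankings:
        for pos, item in enumerate(r):
            scores[item] += n - 1 - pos
    sensitivity = n * (n - 1) / 2
    noisy = scores + rng.laplace(0, sensitivity / epsilon, n)
    return list(np.argsort(-noisy))     # aggregate ranking, best first

print(private_borda([[0, 1, 2], [0, 2, 1], [1, 0, 2]], epsilon=1.0))
```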
Garrett Bernstein, Ryan McKenna, Tao Sun, Daniel Sheldon, Michael Hay, Gerome Miklau: Differentially Private Learning of Undirected Graphical Models Using Collective Graphical Models. Inproceedings: Proceedings of the 34th International Conference on Machine Learning (ICML 2017), Sydney, NSW, Australia, pp. 478–487, 2017. URL: http://proceedings.mlr.press/v70/bernstein17a.html

We investigate the problem of learning discrete graphical models in a differentially private way. Approaches to this problem range from privileged algorithms that conduct learning completely behind the privacy barrier to schemes that release private summary statistics paired with algorithms to learn parameters from those statistics. We show that the approach of releasing noisy sufficient statistics using the Laplace mechanism achieves a good trade-off between privacy, utility, and practicality. A naive learning algorithm that uses the noisy sufficient statistics "as is" outperforms general-purpose differentially private learning algorithms. However, it has three limitations: it ignores knowledge about the data generating process, rests on uncertain theoretical foundations, and exhibits certain pathologies. We develop a more principled approach that applies the formalism of collective graphical models to perform inference over the true sufficient statistics within an expectation-maximization framework. We show that this learns better models than competing approaches on both synthetic data and on real human mobility data used as a case study.
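The "noisy sufficient statistics" baseline that the paper starts from is easy to sketch: release the contingency table of counts through the Laplace mechanism, then learn from the noisy table as if it were exact. The collective-graphical-models/EM inference the paper contributes is not reproduced here.

```python
import numpy as np

def noisy_sufficient_stats(counts, epsilon, rng=np.random.default_rng(0)):
    """One person changes one cell by 1, so the table has L1 sensitivity 1."""
    return counts + rng.laplace(0, 1.0 / epsilon, counts.shape)

def naive_fit(noisy_counts):
    """'As is' learning: clamp negatives, renormalize into probabilities."""
    c = np.clip(noisy_counts, 0, None) + 1e-9
    return c / c.sum()

table = np.array([[30.0, 10.0], [5.0, 55.0]])  # joint counts, two binary vars
print(naive_fit(noisy_sufficient_stats(table, epsilon=0.5)))
```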
Sainyam Galhotra, Yuriy Brun, Alexandra Meliou: Fairness Testing: Testing Software for Discrimination. Inproceedings: Proceedings of the 2017 11th Joint Meeting of the European Software Engineering Conference and the ACM SIGSOFT Symposium on the Foundations of Software Engineering (ESEC/FSE 2017), pp. 498–510, 2017. DOI: 10.1145/3106237.3106277. (ACM SIGSOFT Distinguished Paper Award)

This paper defines the notions of software fairness and discrimination and develops a testing-based method for measuring if and how much software discriminates. Specifically, the paper focuses on measuring causality in discriminatory behavior. Modern software contributes to important societal decisions and evidence of software discrimination has been found in systems that recommend criminal sentences, grant access to financial loans and products, and determine who is allowed to participate in promotions and receive services. Our approach, Themis, measures discrimination in software by generating efficient, discrimination-testing test suites. Given a schema describing valid system inputs, Themis generates discrimination tests automatically and, notably, does not require an oracle. We evaluate Themis on 20 software systems, 12 of which come from prior work with explicit focus on avoiding discrimination. We find that (1) Themis is effective at discovering software discrimination, (2) state-of-the-art techniques for removing discrimination from algorithms fail in many situations, at times discriminating against as much as 98% of an input subdomain, (3) Themis optimizations are effective at producing efficient test suites for measuring discrimination, and (4) Themis is more efficient on systems that exhibit more discrimination. We thus demonstrate that fairness testing is a critical aspect of the software development cycle in domains with possible discrimination and provide initial tools for measuring software discrimination.
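The causal test at the heart of this measurement can be sketched in a few lines: sample inputs from the schema, flip only the protected attribute, and count how often the decision changes. Themis itself adds confidence-driven sampling, pruning, and test-suite generation; the schema and toy system below are illustrative assumptions.

```python
import random

def causal_discrimination(system, schema, protected, trials=1000, seed=0):
    """Fraction of sampled inputs whose output depends on `protected` alone."""
    rng = random.Random(seed)
    flips = 0
    for _ in range(trials):
        x = {attr: rng.choice(vals) for attr, vals in schema.items()}
        outputs = {system({**x, protected: v}) for v in schema[protected]}
        flips += len(outputs) > 1      # same input, protected value flipped
    return flips / trials

schema = {"race": ["A", "B"], "income": list(range(10))}
toy_system = lambda x: x["income"] > 5 or x["race"] == "A"   # a biased decision
print(causal_discrimination(toy_system, schema, "race"))     # about 0.6
```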
Matteo Brucato, Azza Abouzied, Alexandra Meliou: A Scalable Execution Engine for Package Queries. Journal Article: SIGMOD Record, 46(1), pp. 24–31, 2017, ISSN: 0163-5808. DOI: 10.1145/3093754.3093761. (ACM SIGMOD Research Highlight Award)

Many modern applications and real-world problems involve the design of item collections, or packages: from planning your daily meals all the way to mapping the universe. Despite the pervasive need for packages, traditional data management does not offer support for their definition and computation. This is because traditional database queries follow a powerful, but very simple model: a query defines constraints that each tuple in the result must satisfy. However, a system tasked with the design of packages cannot consider items independently; rather, the system needs to determine if a set of items collectively satisfy given criteria. In this paper, we present package queries, a new query model that extends traditional database queries to handle complex constraints and preferences over answer sets. We develop a full-fledged package query system, implemented on top of a traditional database engine. Our work makes several contributions. First, we design PaQL, a SQL-based query language that supports the declarative specification of package queries. Second, we present a fundamental strategy for evaluating package queries that combines the capabilities of databases and constraint optimization solvers. The core of our approach is a set of translation rules that transform a package query to an integer linear program. Third, we introduce an offline data partitioning strategy allowing query evaluation to scale to large data sizes. Fourth, we introduce SketchRefine, an efficient and scalable algorithm for package evaluation, which offers strong approximation guarantees. Finally, we present extensive experiments over real-world data. Our results demonstrate that SketchRefine is effective at deriving high-quality package results, and achieves runtime performance that is an order of magnitude faster than directly using ILP solvers over large datasets.
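The database-plus-solver strategy is concrete enough to sketch. Below, a toy package query over a meals table ("pick at least three items, keep total calories under 2000, minimize price") is translated into an integer linear program, using the PuLP library as an assumed stand-in for the solvers the system integrates; the translation shown simplifies the paper's PaQL rules.

```python
import pulp

meals = [{"kcal": 700, "price": 6.0}, {"kcal": 500, "price": 4.5},
         {"kcal": 900, "price": 3.0}, {"kcal": 400, "price": 5.0}]

prob = pulp.LpProblem("package_query", pulp.LpMinimize)
# One 0/1 decision variable per tuple: is the tuple in the package?
x = [pulp.LpVariable(f"x{i}", cat="Binary") for i in range(len(meals))]

prob += pulp.lpSum(x[i] * m["price"] for i, m in enumerate(meals))          # MINIMIZE SUM(price)
prob += pulp.lpSum(x[i] * m["kcal"] for i, m in enumerate(meals)) <= 2000   # SUCH THAT SUM(kcal) <= 2000
prob += pulp.lpSum(x) >= 3                                                  # package-size constraint

prob.solve(pulp.PULP_CBC_CMD(msg=False))
print([i for i in range(len(meals)) if x[i].value() == 1])   # -> [1, 2, 3]
```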
Xiaolan Wang, Alexandra Meliou, Eugene Wu: QFix: Diagnosing Errors through Query Histories. Inproceedings: Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD 2017), pp. 1369–1384, 2017. DOI: 10.1145/2882903.2899388

Data-driven applications rely on the correctness of their data to function properly and effectively. Errors in data can be incredibly costly and disruptive, leading to loss of revenue, incorrect conclusions, and misguided policy decisions. While data cleaning tools can purge datasets of many errors before the data is used, applications and users interacting with the data can introduce new errors. Subsequent valid updates can obscure these errors and propagate them through the dataset causing more discrepancies. Even when some of these discrepancies are discovered, they are often corrected superficially, on a case-by-case basis, further obscuring the true underlying cause, and making detection of the remaining errors harder. In this paper, we propose QFix, a framework that derives explanations and repairs for discrepancies in relational data, by analyzing the effect of queries that operated on the data and identifying potential mistakes in those queries. QFix is flexible, handling scenarios where only a subset of the true discrepancies is known, and robust to different types of update workloads. We make four important contributions: (a) we formalize the problem of diagnosing the causes of data errors based on the queries that operated on and introduced errors to a dataset; (b) we develop exact methods for deriving diagnoses and fixes for identified errors using state-of-the-art tools; (c) we present several optimization techniques that improve our basic approach without compromising accuracy, and (d) we leverage a tradeoff between accuracy and performance to scale diagnosis to large datasets and query logs, while achieving near-optimal results. We demonstrate the effectiveness of QFix through extensive evaluation over benchmark and synthetic data.
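A drastically simplified version of the diagnosis problem makes the setup concrete: given a log of "UPDATE ... WHERE value < cut" queries over a toy one-column table and the expected final state, search for the single constant change that reproduces it. The log encoding and brute-force search are assumptions; QFix encodes the real problem exactly using MILP solvers.

```python
def replay(log, db):
    """Apply each (cut, delta) query: UPDATE db SET v = v + delta WHERE v < cut."""
    for cut, delta in log:
        db = [v + delta if v < cut else v for v in db]
    return db

def diagnose(log, initial, expected, candidates=range(20)):
    """Find (query index, repaired cut) that reproduces the expected state."""
    for qi in range(len(log)):
        for cut in candidates:
            fixed = log[:qi] + [(cut, log[qi][1])] + log[qi + 1:]
            if replay(fixed, list(initial)) == expected:
                return qi, cut
    return None

log = [(4, 10), (8, 1)]                  # the second query's cut should be 6
initial = [1, 5, 7, 9]
expected = replay([(4, 10), (6, 1)], list(initial))
print(diagnose(log, initial, expected))  # -> (1, 6)
```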
Haopeng Zhang, Yanlei Diao, Alexandra Meliou: EXstream: Explaining Anomalies in Event Stream Monitoring. Inproceedings: Proceedings of the 20th International Conference on Extending Database Technology (EDBT 2017), Venice, Italy, pp. 156–167, 2017. DOI: 10.5441/002/edbt.2017.15

In this paper, we present the EXstream system that provides high-quality explanations for anomalous behaviors that users annotate on CEP-based monitoring results. Given the new requirements for explanations, namely, conciseness, consistency with human interpretation, and prediction power, most existing techniques cannot produce explanations that satisfy all three of them. The key technical contributions of this work include a formal definition of optimally explaining anomalies in CEP monitoring, and three key techniques for generating sufficient feature space, characterizing the contribution of each feature to the explanation, and selecting a small subset of features as the optimal explanation, respectively. Evaluation using two real-world use cases shows that EXstream can outperform existing techniques significantly in conciseness and consistency while achieving comparable high prediction power and retaining a highly efficient implementation of a data stream system.
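One of the three ingredients, scoring how much each feature contributes to an explanation, can be approximated with a toy measure: the best accuracy a single threshold on that feature achieves at separating annotated anomalous windows from normal ones. The paper's actual contribution measure and feature-space construction are richer than this sketch.

```python
def single_feature_power(values, labels):
    """Best accuracy of any one-threshold predictor on this feature."""
    best = 0.0
    for t in set(values):
        for sign in (1, -1):
            pred = [sign * v >= sign * t for v in values]
            best = max(best, sum(p == l for p, l in zip(pred, labels)) / len(labels))
    return best

def explain(feature_table, labels, k=2):
    """Keep the k features that best separate anomalous from normal windows."""
    scores = {name: single_feature_power(vals, labels)
              for name, vals in feature_table.items()}
    return sorted(scores, key=scores.get, reverse=True)[:k]

features = {"cpu": [0.2, 0.3, 0.9, 0.95], "queue_len": [3, 4, 4, 3]}
labels = [False, False, True, True]     # user-annotated anomalous windows
print(explain(features, labels, k=1))   # -> ['cpu']
```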
2016
Olga Papaemmanouil, Yanlei Diao, Kyriaki Dimitriadou, Liping Peng: Interactive Data Exploration via Machine Learning Models. Journal Article: IEEE Data Engineering Bulletin, 39(4), pp. 38–49, 2016. URL: http://sites.computer.org/debull/A16dec/p38.pdf

This article provides an overview of our research on data exploration. Our work aims to facilitate interactive exploration tasks in many big data applications in the scientific, biomedical and healthcare domains. We argue for a shift towards learning-based exploration techniques that automatically steer the user towards interesting data areas based on relevance feedback on database samples, aiming to achieve the goal of identifying all database objects that match the user interest with high efficiency. Our research realizes machine learning theory in the new setting of interactive data exploration and develops new optimizations to support "automated" data exploration with high performance over large databases. In this paper, we discuss a suite of techniques that draw insights from machine learning algorithms to guide the exploration of a big data space and leverage the knowledge of exploration patterns to optimize query processing inside the database.
Serge Abiteboul, Gerome Miklau, Julia Stoyanovich, Gerhard Weikum: Data, Responsibly (Dagstuhl Seminar 16291). Journal Article: Dagstuhl Reports, 6(7), pp. 42–71, 2016. DOI: 10.4230/DagRep.6.7.42

Big data technology promises to improve people's lives, accelerate scientific discovery and innovation, and bring about positive societal change. Yet, if not used responsibly, large-scale data analysis and data-driven algorithmic decision-making can increase economic inequality, affirm systemic bias, and even destabilize global markets. While the potential benefits of data analysis techniques are well accepted, the importance of using them responsibly - that is, in accordance with ethical and moral norms, and with legal and policy considerations - is not yet part of the mainstream research agenda in computer science. Dagstuhl Seminar "Data, Responsibly" brought together academic and industry researchers from several areas of computer science, including a broad representation of data management, but also data mining, security/privacy, and computer networks, as well as social sciences researchers, data journalists, and those active in government think-tanks and policy initiatives. The goals of the seminar were to assess the state of data analysis in terms of fairness, transparency and diversity, identify new research challenges, and derive an agenda for computer science research and education efforts in responsible data analysis and use. While the topic of the seminar is transdisciplinary in nature, an important goal of the seminar was to identify opportunities for high-impact contributions to this important emergent area specifically from the data management community.
Yue Wang, Alexandra Meliou, Gerome Miklau: Lifting the Haze off the Cloud: A Consumer-Centric Market for Database Computation in the Cloud. Journal Article: PVLDB, 10(4), pp. 373–384, 2016. URL: http://www.vldb.org/pvldb/vol10/p373-wang.pdf

The availability of public computing resources in the cloud has revolutionized data analysis, but requesting cloud resources often involves complex decisions for consumers. Estimating the completion time and cost of a computation and requesting the appropriate cloud resources are challenging tasks even for an expert user. We propose a new market-based framework for pricing computational tasks in the cloud. Our framework introduces an agent between consumers and cloud providers. The agent takes data and computational tasks from users, estimates time and cost for evaluating the tasks, and returns to consumers contracts that specify the price and completion time. Our framework can be applied directly to existing cloud markets without altering the way cloud providers offer and price services. In addition, it simplifies cloud use for consumers by allowing them to compare contracts, rather than choose resources directly. We present design, analytical, and algorithmic contributions focusing on pricing computation contracts, analyzing their properties, and optimizing them in complex workflows. We conduct an experimental evaluation of our market framework over a real-world cloud service and demonstrate empirically that our market ensures three key properties: (a) that consumers benefit from using the market due to competitiveness among agents, (b) that agents have an incentive to price contracts fairly, and (c) that inaccuracies in estimates do not pose a significant risk to agents' profits. Finally, we present a fine-grained pricing mechanism for complex workflows and show that it can increase agent profits by more than an order of magnitude in some cases.
Kyriaki Dimitriadou, Olga Papaemmanouil, Yanlei Diao: AIDE: An Active Learning-Based Approach for Interactive Data Exploration. Journal Article: IEEE Transactions on Knowledge and Data Engineering, 28(11), pp. 2842–2856, 2016. DOI: 10.1109/TKDE.2016.2599168

In this paper, we argue that database systems be augmented with an automated data exploration service that methodically steers users through the data in a meaningful way. Such an automated system is crucial for deriving insights from complex datasets found in many big data applications such as scientific and healthcare applications, as well as for reducing the human effort of data exploration. Towards this end, we present AIDE, an Automatic Interactive Data Exploration framework that assists users in discovering new interesting data patterns and in eliminating expensive ad-hoc exploratory queries. AIDE relies on a seamless integration of classification algorithms and data management optimization techniques that collectively strive to accurately learn the user's interests based on their relevance feedback on strategically collected samples. We present a number of exploration techniques as well as optimizations that minimize the number of samples presented to the user while offering interactive performance. AIDE can deliver highly accurate query predictions for very common conjunctive queries with small user effort while, given a reasonable number of samples, it can predict with high accuracy complex disjunctive queries. It provides interactive performance as it limits the user wait time per iteration of exploration to less than a few seconds.
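The exploration loop described here, strategic samples, relevance feedback, and a classifier that converges to the user's interest region, can be sketched with an uncertainty-sampling loop. Decision-tree classification matches the paper's setting, but scikit-learn and the plain most-uncertain-point strategy are assumptions standing in for AIDE's optimized sampling.

```python
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(0)
data = rng.uniform(0, 10, size=(2000, 2))                    # the database
user_interest = lambda x: (2 < x[0] < 5) and (1 < x[1] < 4)  # hidden target query

labeled = [int(i) for i in rng.choice(len(data), 20, replace=False)]
labeled.append(int(np.argmin(((data - [3.5, 2.5]) ** 2).sum(axis=1))))  # seed a hit
labels = [user_interest(data[i]) for i in labeled]

for _ in range(15):                               # feedback iterations
    model = DecisionTreeClassifier(max_depth=4, random_state=0)
    model.fit(data[labeled], labels)
    p_relevant = model.predict_proba(data)[:, -1]
    i = int(np.argmin(np.abs(p_relevant - 0.5)))  # most uncertain sample
    if i not in labeled:
        labeled.append(i)
        labels.append(user_interest(data[i]))     # simulated user feedback

print("tuples predicted relevant:", int(model.predict(data).sum()))
```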
Yanlei Diao, Michael J. Franklin High-Performance XML Message Brokering Incollection Data Stream Management - Processing High-Speed Data Streams, pp. 451–471, 2016. @incollection{DBLP:books/sp/16/DiaoF16, title = {High-Performance XML Message Brokering}, author = {Yanlei Diao and Michael J. Franklin}, url = {http://dx.doi.org/10.1007/978-3-540-28608-0_22}, doi = {10.1007/978-3-540-28608-0_22}, year = {2016}, date = {2016-01-01}, booktitle = {Data Stream Management - Processing High-Speed Data Streams}, pages = {451--471}, crossref = {DBLP:books/sp/GGR2016}, abstract = {For distributed environments including Web Services, data and application integration, and personalized content delivery, XML is becoming the common wire format for data. In this emerging distributed infrastructure, XML message brokers will play a key role as central exchange points for messages sent between applications and/or users. Users (equivalently, applications, or organizations) subscribe to the message broker by providing profiles expressing their data interests. After arriving at the message broker, these profiles become “standing queries,” which are executed on all incoming data. Data sources publish their data by pushing streams of XML messages to the broker. The broker delivers to each user the messages that match his data interests; these messages are presented in the required format of the user. We have developed “YFilter”, an XML filtering system aimed at providing efficient filtering for large numbers (e.g., 10’s or 100’s of thousands) of path queries. The key innovation in YFilter is a Nondeterministic Finite Automaton (NFA)-based representation of path expressions which combines all queries into a single machine. YFilter exploits commonality among path queries by merging the common prefixes of the paths so that they are processed at most once. The NFA-based implementation also provides additional benefits including a relatively small machine size, flexibility in dealing with diverse characteristics of data and queries, incremental machine construction, and ease of maintenance. This work has been supported in part by the National Science Foundation under the ITR grants IIS0086057 and SI0122599 and by Boeing, IBM, Intel, Microsoft, Siemens, and the UC MICRO program.}, keywords = {}, pubstate = {published}, tppubtype = {incollection} } For distributed environments including Web Services, data and application integration, and personalized content delivery, XML is becoming the common wire format for data. In this emerging distributed infrastructure, XML message brokers will play a key role as central exchange points for messages sent between applications and/or users. Users (equivalently, applications, or organizations) subscribe to the message broker by providing profiles expressing their data interests. After arriving at the message broker, these profiles become “standing queries,” which are executed on all incoming data. Data sources publish their data by pushing streams of XML messages to the broker. The broker delivers to each user the messages that match his data interests; these messages are presented in the required format of the user. We have developed “YFilter”, an XML filtering system aimed at providing efficient filtering for large numbers (e.g., 10’s or 100’s of thousands) of path queries. The key innovation in YFilter is a Nondeterministic Finite Automaton (NFA)-based representation of path expressions which combines all queries into a single machine. 
YFilter exploits commonality among path queries by merging the common prefixes of the paths so that they are processed at most once. The NFA-based implementation also provides additional benefits including a relatively small machine size, flexibility in dealing with diverse characteristics of data and queries, incremental machine construction, and ease of maintenance. This work has been supported in part by the National Science Foundation under the ITR grants IIS0086057 and SI0122599 and by Boeing, IBM, Intel, Microsoft, Siemens, and the UC MICRO program. |
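The prefix-sharing idea is easy to see in miniature. The sketch below is an illustrative toy, not YFilter's implementation: it merges child-axis-only path expressions into one trie-shaped machine so that common prefixes such as /catalog/book are traversed once per input path, whereas the real system builds an NFA that also handles descendant axes ('//') and wildcards.

```python
# Merge simple path expressions (child axis only) into one prefix-sharing
# machine, so /catalog/book/title and /catalog/book/author share the
# /catalog/book states and are matched in a single traversal.
def build_machine(paths):
    root = {}
    for qid, path in enumerate(paths):
        node = root
        for step in path.strip("/").split("/"):
            node = node.setdefault(step, {})
        node.setdefault("$accept", []).append(qid)  # query ids accepted here
    return root

def match(machine, element_path):
    node = machine
    for step in element_path.strip("/").split("/"):
        if step not in node:
            return []
        node = node[step]
    return node.get("$accept", [])

queries = ["/catalog/book/title", "/catalog/book/author", "/catalog/cd/title"]
machine = build_machine(queries)
print(match(machine, "/catalog/book/author"))  # -> [1]
```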
Julia Stoyanovich, Serge Abiteboul, Gerome Miklau Data Responsibly: Fairness, Neutrality and Transparency in Data Analysis Inproceedings Proceedings of the 19th International Conference on Extending Database Technology, EDBT 2016, Bordeaux, France, March 15-16, 2016, Bordeaux, France, March 15-16, 2016., pp. 718–719, 2016. @inproceedings{DBLP:conf/edbt/StoyanovichAM16, title = {Data Responsibly: Fairness, Neutrality and Transparency in Data Analysis}, author = {Julia Stoyanovich and Serge Abiteboul and Gerome Miklau}, url = {http://dx.doi.org/10.5441/002/edbt.2016.103}, doi = {10.5441/002/edbt.2016.103}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 19th International Conference on Extending Database Technology, EDBT 2016, Bordeaux, France, March 15-16, 2016, Bordeaux, France, March 15-16, 2016.}, pages = {718--719}, crossref = {DBLP:conf/edbt/2016}, abstract = {Big data technology holds incredible promise of improving people’s lives, accelerating scientific discovery and innovation, and bringing about positive societal change. Yet, if not used responsibly, this technology can propel economic inequality, destabilize global markets and affirm systemic bias. While the potential benefits of big data are well-accepted, the importance of using these techniques in a fair and transparent manner is rarely considered. The primary goal of this tutorial is to draw the attention of the data management community to the important emerging subject of responsible data management and analysis. We will offer our perspective on the issue, will give an overview of existing technical work, primarily from the data mining and algorithms communities, and will motivate future research directions.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Big data technology holds incredible promise of improving people’s lives, accelerating scientific discovery and innovation, and bringing about positive societal change. Yet, if not used responsibly, this technology can propel economic inequality, destabilize global markets and affirm systemic bias. While the potential benefits of big data are well-accepted, the importance of using these techniques in a fair and transparent manner is rarely considered. The primary goal of this tutorial is to draw the attention of the data management community to the important emerging subject of responsible data management and analysis. We will offer our perspective on the issue, will give an overview of existing technical work, primarily from the data mining and algorithms communities, and will motivate future research directions. |
Michael Hay, Ashwin Machanavajjhala, Gerome Miklau, Yan Chen, Dan Zhang Principled Evaluation of Differentially Private Algorithms using DPBench Inproceedings Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016, pp. 139–154, 2016. @inproceedings{DBLP:conf/sigmod/HayMMCZ16, title = {Principled Evaluation of Differentially Private Algorithms using DPBench}, author = {Michael Hay and Ashwin Machanavajjhala and Gerome Miklau and Yan Chen and Dan Zhang}, url = {http://doi.acm.org/10.1145/2882903.2882931}, doi = {10.1145/2882903.2882931}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016}, pages = {139--154}, crossref = {DBLP:conf/sigmod/2016}, abstract = {Differential privacy has become the dominant standard in the research community for strong privacy protection. There has been a flood of research into query answering algorithms that meet this standard. Algorithms are becoming increasingly complex, and in particular, the performance of many emerging algorithms is data dependent, meaning the distribution of the noise added to query answers may change depending on the input data. Theoretical analysis typically only considers the worst case, making empirical study of average case performance increasingly important. In this paper we propose a set of evaluation principles which we argue are essential for sound evaluation. Based on these principles we propose DPBench, a novel evaluation framework for standardized evaluation of privacy algorithms. We then apply our benchmark to evaluate algorithms for answering 1- and 2-dimensional range queries. The result is a thorough empirical study of 15 published algorithms on a total of 27 datasets that offers new insights into algorithm behavior---in particular the influence of dataset scale and shape---and a more complete characterization of the state of the art. Our methodology is able to resolve inconsistencies in prior empirical studies and place algorithm performance in context through comparison to simple baselines. Finally, we pose open research questions which we hope will guide future algorithm design.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Differential privacy has become the dominant standard in the research community for strong privacy protection. There has been a flood of research into query answering algorithms that meet this standard. Algorithms are becoming increasingly complex, and in particular, the performance of many emerging algorithms is data dependent, meaning the distribution of the noise added to query answers may change depending on the input data. Theoretical analysis typically only considers the worst case, making empirical study of average case performance increasingly important. In this paper we propose a set of evaluation principles which we argue are essential for sound evaluation. Based on these principles we propose DPBench, a novel evaluation framework for standardized evaluation of privacy algorithms. We then apply our benchmark to evaluate algorithms for answering 1- and 2-dimensional range queries. The result is a thorough empirical study of 15 published algorithms on a total of 27 datasets that offers new insights into algorithm behavior---in particular the influence of dataset scale and shape---and a more complete characterization of the state of the art. 
Our methodology is able to resolve inconsistencies in prior empirical studies and place algorithm performance in context through comparison to simple baselines. Finally, we pose open research questions which we hope will guide future algorithm design. |
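As a minimal illustration of the kind of measurement such a benchmark standardizes, the sketch below estimates average-case L1 error of the baseline Laplace mechanism on random 1-D range queries over two differently shaped datasets; the datasets and parameters are invented for illustration. Notably, the Laplace baseline's expected error is data-independent, and it is exactly the data-adaptive algorithms, whose error does differ across such inputs, that make principled empirical comparison necessary.

```python
# Average-case L1 error of the Laplace mechanism on random 1-D range queries,
# measured over two histograms with different shapes (uniform vs. skewed).
import numpy as np

rng = np.random.default_rng(7)
epsilon = 0.1
datasets = {
    "uniform": np.full(256, 100.0),
    "skewed": np.bincount(rng.zipf(2.0, size=25_600).clip(max=256) - 1,
                          minlength=256).astype(float),
}

def noisy_hist(hist, eps):
    # Unit counts have sensitivity 1; range queries are sums over noisy bins.
    return hist + rng.laplace(scale=1.0 / eps, size=hist.shape)

def avg_range_error(hist, eps, trials=500):
    noisy = noisy_hist(hist, eps)
    errs = []
    for _ in range(trials):
        lo, hi = sorted(rng.integers(0, len(hist), size=2))
        errs.append(abs(noisy[lo:hi + 1].sum() - hist[lo:hi + 1].sum()))
    return float(np.mean(errs))

for name, hist in datasets.items():
    print(f"{name:8s} avg |error| = {avg_range_error(hist, epsilon):.1f}")
```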
Michael Hay, Ashwin Machanavajjhala, Gerome Miklau, Yan Chen, Dan Zhang, George Bissias Exploring Privacy-Accuracy Tradeoffs using DPComp Inproceedings Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016, pp. 2101–2104, 2016. @inproceedings{DBLP:conf/sigmod/HayMMCZB16, title = {Exploring Privacy-Accuracy Tradeoffs using DPComp}, author = {Michael Hay and Ashwin Machanavajjhala and Gerome Miklau and Yan Chen and Dan Zhang and George Bissias}, url = {http://doi.acm.org/10.1145/2882903.2899387}, doi = {10.1145/2882903.2899387}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016}, pages = {2101--2104}, crossref = {DBLP:conf/sigmod/2016}, abstract = {The emergence of differential privacy as a primary standard for privacy protection has led to the development, by the research community, of hundreds of algorithms for various data analysis tasks. Yet deployment of these techniques has been slowed by the complexity of algorithms and an incomplete understanding of the cost to accuracy implied by the adoption of differential privacy. In this demonstration we present DPComp, a publicly-accessible web-based system, designed to support a broad community of users, including data analysts, privacy researchers, and data owners. Users can use DPComp to assess the accuracy of state-of-the-art privacy algorithms and interactively explore algorithm output in order to understand, both quantitatively and qualitatively, the error introduced by the algorithms. In addition, users can contribute new algorithms and new (non-sensitive) datasets. DPComp automatically incorporates user contributions into an evolving benchmark based on a rigorous evaluation methodology articulated by Hay et al. }, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The emergence of differential privacy as a primary standard for privacy protection has led to the development, by the research community, of hundreds of algorithms for various data analysis tasks. Yet deployment of these techniques has been slowed by the complexity of algorithms and an incomplete understanding of the cost to accuracy implied by the adoption of differential privacy. In this demonstration we present DPComp, a publicly-accessible web-based system, designed to support a broad community of users, including data analysts, privacy researchers, and data owners. Users can use DPComp to assess the accuracy of state-of-the-art privacy algorithms and interactively explore algorithm output in order to understand, both quantitatively and qualitatively, the error introduced by the algorithms. In addition, users can contribute new algorithms and new (non-sensitive) datasets. DPComp automatically incorporates user contributions into an evolving benchmark based on a rigorous evaluation methodology articulated by Hay et al. |
Matteo Brucato, Juan Felipe Beltran, Azza Abouzied, Alexandra Meliou Scalable Package Queries in Relational Database Systems Journal Article PVLDB, 9 (7), pp. 576–587, 2016, (Best of VLDB 2016). @article{DBLP:journals/pvldb/BrucatoBAM16, title = {Scalable Package Queries in Relational Database Systems}, author = {Matteo Brucato and Juan Felipe Beltran and Azza Abouzied and Alexandra Meliou}, url = {http://www.vldb.org/pvldb/vol9/p576-brucato.pdf}, year = {2016}, date = {2016-01-01}, journal = {PVLDB}, volume = {9}, number = {7}, pages = {576--587}, abstract = {Traditional database queries follow a simple model: they define constraints that each tuple in the result must satisfy. This model is computationally efficient, as the database system can evaluate the query conditions on each tuple individually. However, many practical, real-world problems require a collection of result tuples to satisfy constraints collectively, rather than individually. In this paper, we present package queries, a new query model that extends traditional database queries to handle complex constraints and preferences over answer sets. We develop a full-fledged package query system, implemented on top of a traditional database engine. Our work makes several contributions. First, we design PaQL, a SQL-based query language that supports the declarative specification of package queries. We prove that PaQL is at least as expressive as integer linear programming, and therefore, evaluation of package queries is in general NP-hard. Second, we present a fundamental evaluation strategy that combines the capabilities of databases and constraint optimization solvers to derive solutions to package queries. The core of our approach is a set of translation rules that transform a package query to an integer linear program. Third, we introduce an offline data partitioning strategy allowing query evaluation to scale to large data sizes. Fourth, we introduce SketchRefine, a scalable algorithm for package evaluation, with strong approximation guarantees ((1±ε)^6-factor approximation). Finally, we present extensive experiments over real-world and benchmark data. The results demonstrate that SketchRefine is effective at deriving high-quality package results, and achieves runtime performance that is an order of magnitude faster than directly using ILP solvers over large datasets.}, note = {Best of VLDB 2016}, keywords = {}, pubstate = {published}, tppubtype = {article} } Traditional database queries follow a simple model: they define constraints that each tuple in the result must satisfy. This model is computationally efficient, as the database system can evaluate the query conditions on each tuple individually. However, many practical, real-world problems require a collection of result tuples to satisfy constraints collectively, rather than individually. In this paper, we present package queries, a new query model that extends traditional database queries to handle complex constraints and preferences over answer sets. We develop a full-fledged package query system, implemented on top of a traditional database engine. Our work makes several contributions. First, we design PaQL, a SQL-based query language that supports the declarative specification of package queries. We prove that PaQL is at least as expressive as integer linear programming, and therefore, evaluation of package queries is in general NP-hard. 
Second, we present a fundamental evaluation strategy that combines the capabilities of databases and constraint optimization solvers to derive solutions to package queries. The core of our approach is a set of translation rules that transform a package query to an integer linear program. Third, we introduce an offline data partitioning strategy allowing query evaluation to scale to large data sizes. Fourth, we introduce SketchRefine, a scalable algorithm for package evaluation, with strong approximation guarantees ((1±ε)^6-factor approximation). Finally, we present extensive experiments over real-world and benchmark data. The results demonstrate that SketchRefine is effective at deriving high-quality package results, and achieves runtime performance that is an order of magnitude faster than directly using ILP solvers over large datasets. |
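To make the query model concrete, here is a hypothetical PaQL-flavored package query and a deliberately naive evaluator for it. The syntax and data are illustrative assumptions; the paper's actual strategy translates such a query into an integer linear program with one binary variable per tuple and scales it with SketchRefine, rather than enumerating candidate packages as done below.

```python
# A package query constrains the result *set* collectively, e.g. (hypothetical
# PaQL-flavored syntax):
#   SELECT PACKAGE(*) FROM meals
#   SUCH THAT SUM(fat) <= 30 AND COUNT(*) = 3
#   MAXIMIZE SUM(protein)
# Naive evaluation by enumeration, for illustration only; the ILP view assigns
# a 0/1 decision variable to each tuple and hands the program to a solver.
from itertools import combinations

meals = [  # (name, fat, protein)
    ("oatmeal", 3, 6), ("eggs", 11, 13), ("salmon", 13, 20),
    ("salad", 2, 3), ("steak", 19, 25), ("tofu", 9, 17),
]

feasible = (pkg for pkg in combinations(meals, 3)
            if sum(fat for _, fat, _ in pkg) <= 30)
best = max(feasible, key=lambda pkg: sum(p for _, _, p in pkg))
print([name for name, _, _ in best],
      "fat:", sum(f for _, f, _ in best),
      "protein:", sum(p for _, _, p in best))
```

Enumeration is exponential in the package size, which is precisely why the translation-to-ILP route, and the partitioning behind SketchRefine, matter at scale.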
Xiaolan Wang, Alexandra Meliou, Eugene Wu QFix: Demonstrating Error Diagnosis in Query Histories Inproceedings Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016, pp. 2177–2180, 2016. @inproceedings{DBLP:conf/sigmod/WangM016, title = {QFix: Demonstrating Error Diagnosis in Query Histories}, author = {Xiaolan Wang and Alexandra Meliou and Eugene Wu}, url = {http://doi.acm.org/10.1145/2882903.2899388}, doi = {10.1145/2882903.2899388}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016}, pages = {2177--2180}, crossref = {DBLP:conf/sigmod/2016}, abstract = {An increasing number of applications in all aspects of society rely on data. Despite the long line of research in data cleaning and repairs, data correctness has been an elusive goal. Errors in the data can be extremely disruptive, and are detrimental to the effectiveness and proper function of data-driven applications. Even when data is cleaned, new errors can be introduced by applications and users who interact with the data. Subsequent valid updates can obscure these errors and propagate them through the dataset causing more discrepancies. Any discovered errors tend to be corrected superficially, on a case-by-case basis, further obscuring the true underlying cause, and making detection of the remaining errors harder. In this demo proposal, we outline the design of QFix, a query-centric framework that derives explanations and repairs for discrepancies in relational data based on potential errors in the queries that operated on the data. This is a marked departure from traditional data-centric techniques that directly fix the data. We then describe how users will use QFix in a demonstration scenario. Participants will be able to select from a number of transactional benchmarks, introduce errors into the queries that are executed, and compare the fixes to the queries proposed by QFix as well as existing alternative algorithms such as decision trees.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } An increasing number of applications in all aspects of society rely on data. Despite the long line of research in data cleaning and repairs, data correctness has been an elusive goal. Errors in the data can be extremely disruptive, and are detrimental to the effectiveness and proper function of data-driven applications. Even when data is cleaned, new errors can be introduced by applications and users who interact with the data. Subsequent valid updates can obscure these errors and propagate them through the dataset causing more discrepancies. Any discovered errors tend to be corrected superficially, on a case-by-case basis, further obscuring the true underlying cause, and making detection of the remaining errors harder. In this demo proposal, we outline the design of QFix, a query-centric framework that derives explanations and repairs for discrepancies in relational data based on potential errors in the queries that operated on the data. This is a marked departure from traditional data-centric techniques that directly fix the data. We then describe how users will use QFix in a demonstration scenario. 
Participants will be able to select from a number of transactional benchmarks, introduce errors into the queries that are executed, and compare the fixes to the queries proposed by QFix as well as existing alternative algorithms such as decision trees. |
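To convey the flavor of query-centric diagnosis, here is a hedged toy, not QFix's algorithm (the paper encodes the query history and complaints as a constraint problem rather than enumerating): given one logged UPDATE with a possibly mistaken constant and a user complaint about one value, we can search nearby constants for those whose replay explains the complaint.

```python
# Toy query-centric diagnosis: find constants for a logged UPDATE whose replay
# reproduces the value a user complained about. Data and query are invented.
accounts = {"alice": 120.0, "bob": 80.0, "carol": 40.0}

def replay(threshold):
    # Logged query: UPDATE accounts SET bal = bal * 1.10 WHERE bal >= ?
    return {k: (v * 1.10 if v >= threshold else v) for k, v in accounts.items()}

# The history ran with threshold 100; the user complains that bob's balance
# should have been 88.0 (i.e., bob should have received the 10% bonus too).
who, should_be = "bob", 88.0
repairs = [t for t in range(0, 201, 10)
           if abs(replay(t)[who] - should_be) < 1e-6]
print("thresholds that explain the complaint:", repairs)  # 0, 10, ..., 80
```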
Xiaolan Wang, Alexandra Meliou, Eugene Wu QFix: Diagnosing errors through query histories Journal Article CoRR, abs/1601.07539 , 2016. @article{DBLP:journals/corr/WangMW16, title = {QFix: Diagnosing errors through query histories}, author = {Xiaolan Wang and Alexandra Meliou and Eugene Wu}, url = {http://arxiv.org/abs/1601.07539}, year = {2016}, date = {2016-01-01}, journal = {CoRR}, volume = {abs/1601.07539}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Yue Wang, Alexandra Meliou, Gerome Miklau A Consumer-Centric Market for Database Computation in the Cloud Journal Article CoRR, abs/1609.02104 , 2016. @article{DBLP:journals/corr/WangMM16, title = {A Consumer-Centric Market for Database Computation in the Cloud}, author = {Yue Wang and Alexandra Meliou and Gerome Miklau}, url = {http://arxiv.org/abs/1609.02104}, year = {2016}, date = {2016-01-01}, journal = {CoRR}, volume = {abs/1609.02104}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
2015 |
Yanlei Diao Explore-By-Example: A New Database Service for Interactive Data Exploration Inproceedings Proceedings of the Second International Workshop on Exploratory Search in Databases and the Web, ExploreDB 2015, Melbourne, VIC, Australia, May 31 - June 04, 2015, pp. 1, 2015. @inproceedings{DBLP:conf/sigmod/Diao15, title = {Explore-By-Example: A New Database Service for Interactive Data Exploration}, author = {Yanlei Diao}, url = {http://doi.acm.org/10.1145/2795218.2795226}, doi = {10.1145/2795218.2795226}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the Second International Workshop on Exploratory Search in Databases and the Web, ExploreDB 2015, Melbourne, VIC, Australia, May 31 - June 04, 2015}, pages = {1}, crossref = {DBLP:conf/sigmod/2015exploredb}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Michael Hay, Ashwin Machanavajjhala, Gerome Miklau, Yan Chen, Dan Zhang Principled Evaluation of Differentially Private Algorithms using DPBench Journal Article CoRR, abs/1512.04817 , 2015. @article{DBLP:journals/corr/HayMMCZ15, title = {Principled Evaluation of Differentially Private Algorithms using DPBench}, author = {Michael Hay and Ashwin Machanavajjhala and Gerome Miklau and Yan Chen and Dan Zhang}, url = {http://arxiv.org/abs/1512.04817}, year = {2015}, date = {2015-01-01}, journal = {CoRR}, volume = {abs/1512.04817}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Yanlei Diao, Abhishek Roy, Toby Bloom Building Highly-Optimized, Low-Latency Pipelines for Genomic Data Analysis Inproceedings CIDR 2015, Seventh Biennial Conference on Innovative Data Systems Research, Asilomar, CA, USA, January 4-7, 2015, Online Proceedings, 2015. @inproceedings{DBLP:conf/cidr/DiaoRB15, title = {Building Highly-Optimized, Low-Latency Pipelines for Genomic Data Analysis}, author = {Yanlei Diao and Abhishek Roy and Toby Bloom}, url = {http://www.cidrdb.org/cidr2015/Papers/CIDR15_Paper14u.pdf}, year = {2015}, date = {2015-01-01}, booktitle = {CIDR 2015, Seventh Biennial Conference on Innovative Data Systems Research, Asilomar, CA, USA, January 4-7, 2015, Online Proceedings}, crossref = {DBLP:conf/cidr/2015}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Cibele Freire, Wolfgang Gatterbauer, Neil Immerman, Alexandra Meliou The Complexity of Resilience and Responsibility for Self-Join-Free Conjunctive Queries Journal Article PVLDB, 9 (3), pp. 180–191, 2015. @article{DBLP:journals/pvldb/FreireGIM15, title = {The Complexity of Resilience and Responsibility for Self-Join-Free Conjunctive Queries}, author = {Cibele Freire and Wolfgang Gatterbauer and Neil Immerman and Alexandra Meliou}, url = {http://www.vldb.org/pvldb/vol9/p180-freire.pdf}, year = {2015}, date = {2015-01-01}, journal = {PVLDB}, volume = {9}, number = {3}, pages = {180--191}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Cibele Freire, Wolfgang Gatterbauer, Neil Immerman, Alexandra Meliou A Characterization of the Complexity of Resilience and Responsibility for Self-join-free Conjunctive Queries Journal Article CoRR, abs/1507.00674 , 2015. @article{DBLP:journals/corr/FreireGIM15b, title = {A Characterization of the Complexity of Resilience and Responsibility for Self-join-free Conjunctive Queries}, author = {Cibele Freire and Wolfgang Gatterbauer and Neil Immerman and Alexandra Meliou}, url = {http://arxiv.org/abs/1507.00674}, year = {2015}, date = {2015-01-01}, journal = {CoRR}, volume = {abs/1507.00674}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Matteo Brucato, Azza Abouzied, Alexandra Meliou Improving package recommendations through query relaxation Journal Article CoRR, abs/1507.00819 , 2015. @article{DBLP:journals/corr/BrucatoAM15, title = {Improving package recommendations through query relaxation}, author = {Matteo Brucato and Azza Abouzied and Alexandra Meliou}, url = {http://arxiv.org/abs/1507.00819}, year = {2015}, date = {2015-01-01}, journal = {CoRR}, volume = {abs/1507.00819}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Matteo Brucato, Rahul Ramakrishna, Azza Abouzied, Alexandra Meliou PackageBuilder: From Tuples to Packages Journal Article CoRR, abs/1507.00942 , 2015. @article{DBLP:journals/corr/BrucatoRAM15, title = {PackageBuilder: From Tuples to Packages}, author = {Matteo Brucato and Rahul Ramakrishna and Azza Abouzied and Alexandra Meliou}, url = {http://arxiv.org/abs/1507.00942}, year = {2015}, date = {2015-01-01}, journal = {CoRR}, volume = {abs/1507.00942}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Matteo Brucato, Juan Felipe Beltran, Azza Abouzied, Alexandra Meliou Scalable Package Queries in Relational Database Systems Journal Article CoRR, abs/1512.03564 , 2015. @article{DBLP:journals/corr/BrucatoBAM15, title = {Scalable Package Queries in Relational Database Systems}, author = {Matteo Brucato and Juan Felipe Beltran and Azza Abouzied and Alexandra Meliou}, url = {http://arxiv.org/abs/1512.03564}, year = {2015}, date = {2015-01-01}, journal = {CoRR}, volume = {abs/1512.03564}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Chao Li, Gerome Miklau, Michael Hay, Andrew McGregor, Vibhor Rastogi The matrix mechanism: optimizing linear counting queries under differential privacy Journal Article The VLDB Journal, pp. 1-25, 2015, ISSN: 1066-8888. @article{b, title = {The matrix mechanism: optimizing linear counting queries under differential privacy}, author = { Chao Li and Gerome Miklau and Michael Hay and Andrew McGregor and Vibhor Rastogi}, url = {http://dx.doi.org/10.1007/s00778-015-0398-x}, doi = {10.1007/s00778-015-0398-x}, issn = {1066-8888}, year = {2015}, date = {2015-01-01}, journal = {The VLDB Journal}, pages = {1-25}, publisher = {Springer Berlin Heidelberg}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Chao Li, Gerome Miklau Lower Bounds on the Error of Query Sets Under the Differentially-Private Matrix Mechanism Journal Article Theory of Computing Systems, pp. 1-43, 2015, ISSN: 1432-4350. @article{LiMiklau2015lowerbounds, title = {Lower Bounds on the Error of Query Sets Under the Differentially-Private Matrix Mechanism}, author = {Li, Chao and Miklau, Gerome}, url = {http://dx.doi.org/10.1007/s00224-015-9610-z}, doi = {10.1007/s00224-015-9610-z}, issn = {1432-4350}, year = {2015}, date = {2015-01-01}, journal = {Theory of Computing Systems}, pages = {1-43}, publisher = {Springer US}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Ravali Pochampally, Anish Das Sarma, Xin Luna Dong, Alexandra Meliou, Divesh Srivastava Fusing Data with Correlations Journal Article CoRR, abs/1503.00306 , 2015. @article{DBLP:journals/corr/PochampallySDMS15, title = {Fusing Data with Correlations}, author = {Ravali Pochampally and Anish Das Sarma and Xin Luna Dong and Alexandra Meliou and Divesh Srivastava}, url = {http://arxiv.org/abs/1503.00306}, year = {2015}, date = {2015-01-01}, journal = {CoRR}, volume = {abs/1503.00306}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Xiaolan Wang, Mary Feng, Yue Wang, Xin Luna Dong, Alexandra Meliou Error Diagnosis and Data Profiling with Data X-Ray Journal Article PVLDB, 8 (12), pp. 1984–1995, 2015. @article{DBLP:journals/pvldb/WangFWDM15, title = {Error Diagnosis and Data Profiling with Data X-Ray}, author = {Xiaolan Wang and Mary Feng and Yue Wang and Xin Luna Dong and Alexandra Meliou}, url = {http://www.vldb.org/pvldb/vol8/p1984-wang.pdf}, year = {2015}, date = {2015-01-01}, journal = {PVLDB}, volume = {8}, number = {12}, pages = {1984--1995}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Yanlei Diao, Kyriaki Dimitriadou, Zhan Li, Wenzhao Liu, Olga Papaemmanouil, Kemi Peng, Liping Peng AIDE: An Automatic User Navigation System for Interactive Data Exploration Journal Article PVLDB, 8 (12), pp. 1964–1975, 2015. @article{DBLP:journals/pvldb/DiaoDLLPPP15, title = {AIDE: An Automatic User Navigation System for Interactive Data Exploration}, author = {Yanlei Diao and Kyriaki Dimitriadou and Zhan Li and Wenzhao Liu and Olga Papaemmanouil and Kemi Peng and Liping Peng}, url = {http://www.vldb.org/pvldb/vol8/p1964-Diao.pdf}, year = {2015}, date = {2015-01-01}, journal = {PVLDB}, volume = {8}, number = {12}, pages = {1964--1975}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Boduo Li, Yanlei Diao, Prashant J. Shenoy Supporting Scalable Analytics with Latency Constraints Journal Article PVLDB, 8 (11), pp. 1166–1177, 2015. @article{DBLP:journals/pvldb/LiDS15, title = {Supporting Scalable Analytics with Latency Constraints}, author = {Boduo Li and Yanlei Diao and Prashant J. Shenoy}, url = {http://www.vldb.org/pvldb/vol8/p1166-li.pdf}, year = {2015}, date = {2015-01-01}, journal = {PVLDB}, volume = {8}, number = {11}, pages = {1166--1177}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Liping Peng, Yanlei Diao Supporting Data Uncertainty in Array Databases Inproceedings Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data, Melbourne, Victoria, Australia, May 31 - June 4, 2015, pp. 545–560, 2015. @inproceedings{DBLP:conf/sigmod/PengD15, title = {Supporting Data Uncertainty in Array Databases}, author = {Liping Peng and Yanlei Diao}, url = {http://doi.acm.org/10.1145/2723372.2723738}, doi = {10.1145/2723372.2723738}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data, Melbourne, Victoria, Australia, May 31 - June 4, 2015}, pages = {545--560}, crossref = {DBLP:conf/sigmod/2015}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Vera Zaychik Moffitt, Julia Stoyanovich, Serge Abiteboul, Gerome Miklau Collaborative Access Control in WebdamLog Inproceedings Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data, Melbourne, Victoria, Australia, May 31 - June 4, 2015, pp. 197–211, 2015. @inproceedings{DBLP:conf/sigmod/MoffittSAM15, title = {Collaborative Access Control in WebdamLog}, author = {Vera Zaychik Moffitt and Julia Stoyanovich and Serge Abiteboul and Gerome Miklau}, url = {http://doi.acm.org/10.1145/2723372.2749433}, doi = {10.1145/2723372.2749433}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data, Melbourne, Victoria, Australia, May 31 - June 4, 2015}, pages = {197--211}, crossref = {DBLP:conf/sigmod/2015}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Xiaolan Wang, Xin Luna Dong, Alexandra Meliou Data X-Ray: A Diagnostic Tool for Data Errors Inproceedings Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data, Melbourne, Victoria, Australia, May 31 - June 4, 2015, pp. 1231–1245, 2015. @inproceedings{DBLP:conf/sigmod/WangDM15, title = {Data X-Ray: A Diagnostic Tool for Data Errors}, author = {Xiaolan Wang and Xin Luna Dong and Alexandra Meliou}, url = {http://doi.acm.org/10.1145/2723372.2750549}, doi = {10.1145/2723372.2750549}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the 2015 ACM SIGMOD International Conference on Management of Data, Melbourne, Victoria, Australia, May 31 - June 4, 2015}, pages = {1231--1245}, crossref = {DBLP:conf/sigmod/2015}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Kivanç Muslu, Yuriy Brun, Alexandra Meliou Preventing data errors with continuous testing Inproceedings Proceedings of the 2015 International Symposium on Software Testing and Analysis, ISSTA 2015, Baltimore, MD, USA, July 12-17, 2015, pp. 373–384, 2015. @inproceedings{DBLP:conf/issta/MusluBM15, title = {Preventing data errors with continuous testing}, author = {Kivanç Muslu and Yuriy Brun and Alexandra Meliou}, url = {http://doi.acm.org/10.1145/2771783.2771792}, doi = {10.1145/2771783.2771792}, year = {2015}, date = {2015-01-01}, booktitle = {Proceedings of the 2015 International Symposium on Software Testing and Analysis, ISSTA 2015, Baltimore, MD, USA, July 12-17, 2015}, pages = {373--384}, crossref = {DBLP:conf/issta/2015}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
2014 |
Chao Li, Daniel Yang Li, Gerome Miklau, Dan Suciu A Theory of Pricing Private Data Journal Article ACM Trans. Database Syst., 39 (4), pp. 34, 2014. @article{DBLP:journals/tods/LiLMS14, title = {A Theory of Pricing Private Data}, author = {Chao Li and Daniel Yang Li and Gerome Miklau and Dan Suciu}, url = {http://doi.acm.org/10.1145/2691190.2691191}, doi = {10.1145/2691190.2691191}, year = {2014}, date = {2014-01-01}, journal = {ACM Trans. Database Syst.}, volume = {39}, number = {4}, pages = {34}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Alexandra Meliou, Sudeepa Roy, Dan Suciu Causality and Explanations in Databases Journal Article PVLDB, 7 (13), pp. 1715–1716, 2014. @article{DBLP:journals/pvldb/MeliouRS14, title = {Causality and Explanations in Databases}, author = {Alexandra Meliou and Sudeepa Roy and Dan Suciu}, url = {http://www.vldb.org/pvldb/vol7/p1715-meliou.pdf}, year = {2014}, date = {2014-01-01}, journal = {PVLDB}, volume = {7}, number = {13}, pages = {1715--1716}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Ravali Pochampally, Anish Das Sarma, Xin Luna Dong, Alexandra Meliou, Divesh Srivastava Fusing data with correlations Inproceedings International Conference on Management of Data, SIGMOD 2014, Snowbird, UT, USA, June 22-27, 2014, pp. 433–444, 2014. @inproceedings{DBLP:conf/sigmod/PochampallySDMS14, title = {Fusing data with correlations}, author = {Ravali Pochampally and Anish Das Sarma and Xin Luna Dong and Alexandra Meliou and Divesh Srivastava}, url = {http://doi.acm.org/10.1145/2588555.2593674}, doi = {10.1145/2588555.2593674}, year = {2014}, date = {2014-01-01}, booktitle = {International Conference on Management of Data, SIGMOD 2014, Snowbird, UT, USA, June 22-27, 2014}, pages = {433--444}, crossref = {DBLP:conf/sigmod/2014}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Matteo Brucato, Azza Abouzied, Alexandra Meliou Improving package recommendations through query relaxation Inproceedings Proceedings of the First International Workshop on Bringing the Value of "Big Data" to Users, Data4U@VLDB 2014, Hangzhou, China, September 1, 2014, pp. 13, 2014. @inproceedings{DBLP:conf/vldb/BrucatoAM14, title = {Improving package recommendations through query relaxation}, author = {Matteo Brucato and Azza Abouzied and Alexandra Meliou}, url = {http://doi.acm.org/10.1145/2658840.2658843}, doi = {10.1145/2658840.2658843}, year = {2014}, date = {2014-01-01}, booktitle = {Proceedings of the First International Workshop on Bringing the Value of "Big Data" to Users, Data4U@VLDB 2014, Hangzhou, China, September 1, 2014}, pages = {13}, crossref = {DBLP:conf/vldb/2014data4u}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |