2019
Ios Kotsogiannis, Yuchao Tao, Ashwin Machanavajjhala, Gerome Miklau, Michael Hay: Architecting a Differentially Private SQL Engine. Inproceedings, CIDR 2019, 9th Biennial Conference on Innovative Data Systems Research, Asilomar, CA, USA, January 13-16, 2019, Online Proceedings. http://cidrdb.org/cidr2019/papers/p125-kotsogiannis-cidr19.pdf
Brian Hentschel, Peter J Haas, Yuanyuan Tian: General Temporally Biased Sampling Schemes for Online Model Management. Journal Article, ACM Trans. Database Syst., 44(4), pp. 14:1–14:45, 2019. https://doi.org/10.1145/3360903
To maintain the accuracy of supervised learning models in the presence of evolving data streams, we provide temporally biased sampling schemes that weight recent data most heavily, with inclusion probabilities for a given data item decaying over time according to a specified “decay function.” We then periodically retrain the models on the current sample. This approach speeds up the training process relative to training on all of the data. Moreover, time-biasing lets the models adapt to recent changes in the data while—unlike in a sliding-window approach—still keeping some old data to ensure robustness in the face of temporary fluctuations and periodicities in the data values. In addition, the sampling-based approach allows existing analytic algorithms for static data to be applied to dynamic streaming data essentially without change. We provide and analyze both a simple sampling scheme (Targeted-Size Time-Biased Sampling (T-TBS)) that probabilistically maintains a target sample size and a novel reservoir-based scheme (Reservoir-Based Time-Biased Sampling (R-TBS)) that is the first to provide both control over the decay rate and a guaranteed upper bound on the sample size. If the decay function is exponential, then control over the decay rate is complete, and R-TBS maximizes both expected sample size and sample-size stability. For general decay functions, the actual item inclusion probabilities can be made arbitrarily close to the nominal probabilities, and we provide a scheme that allows a tradeoff between sample footprint and sample-size stability. R-TBS rests on the notion of a “fractional sample” and allows for data arrival rates that are unknown and time varying (unlike T-TBS). The R-TBS and T-TBS schemes are of independent interest, extending the known set of unequal-probability sampling schemes. We discuss distributed implementation strategies; experiments in Spark illuminate the performance and scalability of the algorithms, and show that our approach can increase machine learning robustness in the face of evolving data.
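The exponential special case in this abstract is easy to picture in code. The sketch below is only a toy illustration of exponentially decaying inclusion probabilities, assuming a simple one-pass Bernoulli retention rule; it is not the paper's T-TBS or R-TBS scheme and provides neither a target sample size nor a bounded sample footprint.

```python
import math
import random

def time_biased_sample(stream, decay_rate, now):
    """Toy illustration: keep item (t_i, x_i) with probability exp(-decay_rate * age).

    This mimics exponential-decay inclusion probabilities as described in the
    abstract; it is NOT the paper's T-TBS or R-TBS scheme (no target sample size,
    no bounded-footprint reservoir).
    """
    sample = []
    for arrival_time, item in stream:
        age = now - arrival_time
        inclusion_prob = math.exp(-decay_rate * age)  # newer items are favored
        if random.random() < inclusion_prob:
            sample.append(item)
    return sample

# Example: items that arrived at times 0..9, sampled at time 10 with decay rate 0.5.
stream = [(t, f"item-{t}") for t in range(10)]
print(time_biased_sample(stream, decay_rate=0.5, now=10))
```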
Emily A Herbert, Wang Cen, Peter J Haas: NIM: generative neural networks for modeling and generation of simulation inputs. Inproceedings, Proceedings of the 2019 Summer Simulation Conference, SummerSim 2019, Berlin, Germany, July 22-24, 2019, pp. 65:1–65:6. https://dl.acm.org/citation.cfm?id=3374203, https://people.cs.umass.edu/~phaas/files/SSC2019.pdf
We introduce Neural Input Modeling (NIM), a generative-neural-network framework that exploits modern data-rich environments to automatically capture complex simulation input distributions and then generate samples from them. Experiments show that our prototype architecture NIM-VL, which uses a variational autoencoder with LSTM components, can accurately, and with no prior knowledge, automatically capture a range of stochastic processes, including mixed-ARMA and nonhomogeneous Poisson processes, and can efficiently generate sample paths. Moreover, we show that the outputs from a queueing model with (known) complex inputs are statistically close to outputs from the same queueing model but with the inputs learned via NIM. Known distributional properties such as i.i.d. structure and nonnegativity can be exploited to increase accuracy and speed. NIM has the potential to help overcome one of the key barriers to simulation for non-experts.
2018
Abolfazl Asudeh, H V Jagadish, Gerome Miklau, Julia Stoyanovich: On Obtaining Stable Rankings. Journal Article, PVLDB, 12(3), pp. 237–250, 2018. http://www.vldb.org/pvldb/vol12/p237-asudeh.pdf, doi: 10.14778/3291264.3291269
Julia Stoyanovich, Bill Howe, H V Jagadish, Gerome Miklau: Panel: A Debate on Data and Algorithmic Ethics. Journal Article, PVLDB, 11(12), pp. 2165–2167, 2018. http://www.vldb.org/pvldb/vol11/p2165-stoyanovich.pdf, doi: 10.14778/3229863.3240494
Sameera Ghayyur, Yan Chen, Roberto Yus, Ashwin Machanavajjhala, Michael Hay, Gerome Miklau, Sharad Mehrotra: IoT-Detective: Analyzing IoT Data Under Differential Privacy. Inproceedings, Proceedings of the 2018 International Conference on Management of Data, SIGMOD Conference 2018, Houston, TX, USA, June 10-15, 2018, pp. 1725–1728. https://doi.org/10.1145/3183713.3193571
Ke Yang, Julia Stoyanovich, Abolfazl Asudeh, Bill Howe, H V Jagadish, Gerome Miklau: A Nutritional Label for Rankings. Inproceedings, Proceedings of the 2018 International Conference on Management of Data, SIGMOD Conference 2018, Houston, TX, USA, June 10-15, 2018, pp. 1773–1776. https://doi.org/10.1145/3183713.3193568
Dan Zhang, Ryan McKenna, Ios Kotsogiannis, Michael Hay, Ashwin Machanavajjhala, Gerome Miklau: EKTELO: A Framework for Defining Differentially-Private Computations. Inproceedings, Proceedings of the 2018 International Conference on Management of Data, SIGMOD Conference 2018, Houston, TX, USA, June 10-15, 2018, pp. 115–130. https://doi.org/10.1145/3183713.3196921
Brian Neil Levine, Gerome Miklau: Auditing and Forensic Analysis. Incollection, Encyclopedia of Database Systems, Second Edition, 2018. https://doi.org/10.1007/978-1-4614-8265-9_30
Enhui Huang, Liping Peng, Luciano Di Palma, Ahmed Abdelkafi, Anna Liu, Yanlei Diao: Optimization for active learning-based interactive database exploration. Journal Article, Proceedings of the VLDB Endowment, 12(1), pp. 71–84, 2018. http://www.vldb.org/pvldb/vol12/p71-huang.pdf
There is an increasing gap between fast growth of data and limited human ability to comprehend data. Consequently, there has been a growing demand of data management tools that can bridge this gap and help the user retrieve high-value content from data more effectively. In this work, we aim to build interactive data exploration as a new database service, using an approach called “explore-by-example”. In particular, we cast the explore-by-example problem in a principled “active learning” framework, and bring the properties of important classes of database queries to bear on the design of new algorithms and optimizations for active learning-based database exploration. These new techniques allow the database system to overcome a fundamental limitation of traditional active learning, i.e., the slow convergence problem. Evaluation results using real-world datasets and user interest patterns show that our new system significantly outperforms state-of-the-art active learning techniques and data exploration systems in accuracy while achieving desired efficiency for interactive performance.
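For readers unfamiliar with the explore-by-example setting, the sketch below shows a generic pool-based active-learning loop with uncertainty sampling in scikit-learn. It is a plain baseline under assumed synthetic data and a made-up hidden "user interest" predicate, not the paper's system or its convergence optimizations.

```python
import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)
pool = rng.uniform(0, 100, size=(5000, 2))              # unlabeled tuples with 2 numeric attributes

def user_interest(x):                                    # hidden "true" query region (assumed for the demo)
    return int(x[0] > 30 and x[1] < 60)

# Seed with one positive and one negative example, then query the most uncertain tuple each round.
all_labels = np.array([user_interest(x) for x in pool])  # the demo cheats; a real user answers interactively
labeled = [int(np.argmax(all_labels == 1)), int(np.argmax(all_labels == 0))]

for _ in range(25):
    clf = SVC(kernel="rbf", gamma="scale").fit(pool[labeled], all_labels[labeled])
    margin = np.abs(clf.decision_function(pool))          # distance to the decision boundary
    margin[labeled] = np.inf                              # never re-ask an answered tuple
    labeled.append(int(np.argmin(margin)))                # uncertainty sampling

accuracy = (clf.predict(pool) == all_labels).mean()
print(f"model accuracy on the pool after {len(labeled)} labels: {accuracy:.3f}")
```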
Ryan McKenna, Gerome Miklau, Michael Hay, Ashwin Machanavajjhala: Optimizing error of high-dimensional statistical queries under differential privacy. Journal Article, PVLDB, 11(10), pp. 1206–1219, 2018. http://www.vldb.org/pvldb/vol11/p1206-mckenna.pdf (implementation: https://github.com/ryan112358/hdmm)
Differentially private algorithms for answering sets of predicate counting queries on a sensitive database have many applications. Organizations that collect individual-level data, such as statistical agencies and medical institutions, use them to safely release summary tabulations. However, existing techniques are accurate only on a narrow class of query workloads, or are extremely slow, especially when analyzing more than one or two dimensions of the data. In this work we propose HDMM, a new differentially private algorithm for answering a workload of predicate counting queries, that is especially effective for higher-dimensional datasets. HDMM represents query workloads using an implicit matrix representation and exploits this compact representation to efficiently search (a subset of) the space of differentially private algorithms for one that answers the input query workload with high accuracy. We empirically show that HDMM can efficiently answer queries with lower error than state-of-the-art techniques on a variety of low and high dimensional datasets.
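HDMM builds on the matrix-mechanism view of workload answering, in which a workload matrix W is answered through noisy measurements of a strategy matrix A. The sketch below computes the standard expected total squared error of a Laplace-noise matrix mechanism for a small prefix-range workload under two hand-picked strategies; it illustrates the error objective such methods optimize, assuming a pseudoinverse reconstruction, and does not reproduce HDMM's implicit workload representation or its optimization search.

```python
import numpy as np

def expected_squared_error(W, A, epsilon=1.0):
    """Expected total squared error of the Laplace matrix mechanism:
    answer strategy A with Laplace noise of scale sens(A)/epsilon, then
    reconstruct workload answers as W @ pinv(A) @ noisy_answers.
    """
    sensitivity = np.abs(A).sum(axis=0).max()          # L1 sensitivity = max column L1 norm
    scale = sensitivity / epsilon
    reconstruction = W @ np.linalg.pinv(A)
    return 2 * scale**2 * (reconstruction**2).sum()    # Var(Laplace(b)) = 2 * b^2

n = 8
W = np.tril(np.ones((n, n)))                            # workload: all prefix-range queries over 8 bins

identity = np.eye(n)                                    # strategy 1: answer each bin directly
hierarchy = np.vstack([                                 # strategy 2: binary-tree counts at every level
    np.ones((1, n)),
    np.kron(np.eye(2), np.ones((1, 4))),
    np.kron(np.eye(4), np.ones((1, 2))),
    np.eye(n),
])

for name, A in [("identity", identity), ("hierarchical", hierarchy)]:
    print(name, expected_squared_error(W, A))
```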
Brian Hentschel, Peter J Haas, Yuanyuan Tian: Temporally-Biased Sampling for Online Model Management. Inproceedings, Proceedings of the 21st International Conference on Extending Database Technology, EDBT 2018, Vienna, Austria, March 26-29, 2018, pp. 109–120. http://openproceedings.org/2018/conf/edbt/paper-52.pdf
To maintain the accuracy of supervised learning models in the presence of evolving data streams, we provide temporally-biased sampling schemes that weight recent data most heavily, with inclusion probabilities for a given data item decaying exponentially over time. We then periodically retrain the models on the current sample. This approach speeds up the training process relative to training on all of the data. Moreover, time-biasing lets the models adapt to recent changes in the data while—unlike in a sliding-window approach—still keeping some old data to ensure robustness in the face of temporary fluctuations and periodicities in the data values. In addition, the sampling-based approach allows existing analytic algorithms for static data to be applied to dynamic streaming data essentially without change. We provide and analyze both a simple sampling scheme (T-TBS) that probabilistically maintains a target sample size and a novel reservoir-based scheme (R-TBS) that is the first to provide both complete control over the decay rate and a guaranteed upper bound on the sample size, while maximizing both expected sample size and sample-size stability. The latter scheme rests on the notion of a “fractional sample” and, unlike T-TBS, allows for data arrival rates that are unknown and time varying. R-TBS and T-TBS are of independent interest, extending the known set of unequal-probability sampling schemes. We discuss distributed implementation strategies; experiments in Spark illuminate the performance and scalability of the algorithms, and show that our approach can increase machine learning robustness in the face of evolving data.
Yuriy Brun, Alexandra Meliou: Software Fairness. Inproceedings, Proceedings of the New Ideas and Emerging Results Track at the 26th ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE), Lake Buena Vista, FL, USA, 2018. https://people.cs.umass.edu/~brun/pubs/pubs/Brun18fse-nier.pdf
A goal of software engineering research is advancing software quality and the success of the software engineering process. However, while recent studies have demonstrated a new kind of defect in software related to its ability to operate in a fair and unbiased manner, software engineering has not yet wholeheartedly tackled these new kinds of defects, thus leaving software vulnerable. This paper outlines a vision for how software engineering research can help reduce fairness defects and represents a call to action by the software engineering research community to reify that vision. Modern software is riddled with examples of biased behavior, from automated translation injecting gender stereotypes, to vision systems failing to see faces of certain races, to the US criminal justice system relying on biased computational assessments of crime recidivism. While systems may learn bias from biased data, bias can also emerge from ambiguous or incomplete requirement specification, poor design, implementation bugs, and unintended component interactions. We argue that software fairness is analogous to software quality, and that numerous software engineering challenges in the areas of requirements, specification, design, testing, and verification need to be tackled to solve this problem.
Rico Angell, Brittany Johnson, Yuriy Brun, Alexandra Meliou: Themis: Automatically Testing Software for Discrimination. Inproceedings, Proceedings of the Demonstrations Track at the 26th ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE), Lake Buena Vista, FL, USA, 2018. https://people.cs.umass.edu/~brun/pubs/pubs/Angell18fse-demo.pdf (project: http://fairness.cs.umass.edu/, video: https://youtu.be/brB8wkaUesY) (demonstration paper)
Bias in decisions made by modern software is becoming a common and serious problem. We present Themis, an automated test suite generator to measure two types of discrimination, including causal relationships between sensitive inputs and program behavior. We explain how Themis can measure discrimination and aid its debugging, describe a set of optimizations Themis uses to reduce test suite size, and demonstrate Themis' effectiveness on open-source software. Themis is open-source and all our evaluation data are available at http://fairness.cs.umass.edu/. See a video of Themis in action: https://youtu.be/brB8wkaUesY
Xiaolan Wang, Laura Haas, Alexandra Meliou: Explaining Data Integration. Journal Article, IEEE Data Engineering Bulletin, 41(2), pp. 47–58, 2018. http://sites.computer.org/debull/A18june/p47.pdf
Explanations are an integral part of human behavior: people provide explanations to justify choices and actions, and seek explanations to understand the world around them. The need for explanations extends to technology, as semi-automated and fully-automated systems support crucial activities and increasingly important societal functions. The interpretability of these systems and the ability to explain their decision processes are crucial in developing trust in the systems’ function. Further, explanations provide opportunities for systems to interact with human users and obtain feedback, improving their operation. Finally, explanations allow domain experts and system developers to debug erroneous system decisions, diagnose unexpected outcomes, and improve system function. In this paper, we study and review existing data integration systems with respect to their ability to derive explanations. We present a new classification of data integration systems by their explainability and discuss the characteristics of systems within these classes. We review the types of explanations derived by the various data integration systems within each explainability class. Finally, we present a vision of the desired properties of future data integration systems with respect to explanations and discuss the challenges in pursuing this goal.
Yue Wang, Alexandra Meliou, Gerome Miklau: RC-Index: Diversifying Answers to Range Queries. Journal Article, PVLDB, 11(7), pp. 773–786, 2018. https://people.cs.umass.edu/~ameli/projects/fairness/papers/p607-wang.pdf, doi: 10.14778/3192965.3192969
Query result diversification is widely used in data exploration, Web search, and recommendation systems. The problem of returning diversified query results consists of finding a small subset of valid query answers that are representative and different from one another, usually quantified by a diversity score. Most existing techniques for query diversification first compute all valid query results and then find a diverse subset. These techniques are inefficient when the set of valid query results is large. Other work has proposed efficient solutions for restricted application settings, where results are shared across multiple queries. In this paper, our goal is to support result diversification for general range queries over a single relation. We propose the RC-Index, a novel index structure that achieves efficiency by reducing the number of items that must be retrieved by the database to form a diverse set of the desired size (about 1 second for a dataset of 1 million items). Further, we prove that an RC-Index offers strong approximation guarantees. To the best of our knowledge, this is the first index-based diversification method with a guaranteed approximation ratio for range queries.
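As background for what a "diverse subset" of range-query answers looks like, the sketch below runs a standard greedy farthest-point heuristic for max-min diversification over the tuples matching a range predicate. Note that it first scans all valid results, which is exactly the cost the RC-Index is designed to avoid; the index structure itself is not reproduced here, and the data and predicate are made up.

```python
import numpy as np

def greedy_diverse_subset(points, k):
    """Greedy farthest-point heuristic for max-min diversification.

    It scans every valid result and repeatedly adds the point farthest from
    the already-chosen set; this is the expensive baseline that an index such
    as the RC-Index is meant to replace.
    """
    chosen = [0]                                            # start from an arbitrary point
    dist = np.linalg.norm(points - points[0], axis=1)
    for _ in range(k - 1):
        nxt = int(np.argmax(dist))                          # farthest point from the chosen set
        chosen.append(nxt)
        dist = np.minimum(dist, np.linalg.norm(points - points[nxt], axis=1))
    return chosen

rng = np.random.default_rng(1)
data = rng.uniform(0, 1000, size=(100_000, 2))              # made-up relation with two numeric attributes
in_range = data[(data[:, 0] > 100) & (data[:, 0] < 400)]    # range predicate on the first attribute
print(in_range[greedy_diverse_subset(in_range, k=10)])
```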
Anna Fariha, Sheikh Muhammad Sarwar, Alexandra Meliou: SQuID: Semantic Similarity-Aware Query Intent Discovery. Inproceedings, Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), pp. 1745–1748, 2018. https://people.cs.umass.edu/~ameli/projects/squid/papers/squid-demo.pdf, doi: 10.1145/3183713.3193548 (demonstration paper)
Recent expansion of database technology demands a convenient framework for non-expert users to explore datasets. Several approaches exist to assist these non-expert users where they can express their query intent by providing example tuples for their intended query output. However, these approaches treat the structural similarity among the example tuples as the only factor specifying query intent and ignore the richer context present in the data. In this demo, we present SQuID, a system for Semantic similarity aware Query Intent Discovery. SQuID takes a few example tuples from the user as input, through a simple interface, and consults the database to discover deeper associations among these examples. These data-driven associations reveal the semantic context of the provided examples, allowing SQuID to infer the user’s intended query precisely and effectively. SQuID further explains its inference, by displaying the discovered semantic context to the user, who can then provide feedback and tune the result. We demonstrate how SQuID can capture even esoteric and complex semantic contexts, alleviating the need for constructing complex SQL queries, while not requiring the user to have any schema or query language knowledge.
Matteo Brucato, Azza Abouzied, Alexandra Meliou: Package queries: efficient and scalable computation of high-order constraints. Journal Article, The VLDB Journal, 2018. doi: 10.1007/s00778-017-0483-4 (Special Issue on Best Papers of VLDB 2016)
2017
Garrett Bernstein, Ryan McKenna, Tao Sun, Daniel Sheldon, Michael Hay, Gerome Miklau: Differentially Private Learning of Undirected Graphical Models Using Collective Graphical Models. Inproceedings, Proceedings of the 34th International Conference on Machine Learning, ICML 2017, Sydney, NSW, Australia, August 6-11, 2017, pp. 478–487. http://proceedings.mlr.press/v70/bernstein17a.html
We investigate the problem of learning discrete graphical models in a differentially private way. Approaches to this problem range from privileged algorithms that conduct learning completely behind the privacy barrier to schemes that release private summary statistics paired with algorithms to learn parameters from those statistics. We show that the approach of releasing noisy sufficient statistics using the Laplace mechanism achieves a good trade-off between privacy, utility, and practicality. A naive learning algorithm that uses the noisy sufficient statistics “as is” outperforms general-purpose differentially private learning algorithms. However, it has three limitations: it ignores knowledge about the data generating process, rests on uncertain theoretical foundations, and exhibits certain pathologies. We develop a more principled approach that applies the formalism of collective graphical models to perform inference over the true sufficient statistics within an expectation-maximization framework. We show that this learns better models than competing approaches on both synthetic data and on real human mobility data used as a case study.
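The "noisy sufficient statistics" baseline described in the abstract is simple to sketch: for a discrete graphical model the sufficient statistics are contingency-table counts, each individual contributes to a bounded number of them, and Laplace noise calibrated to that sensitivity makes the release differentially private. The sketch below shows only that release step, for an assumed three-variable chain model; it does not implement the paper's collective-graphical-model EM inference.

```python
import numpy as np

def noisy_sufficient_statistics(records, num_values, epsilon):
    """Release pairwise contingency-table counts for a 3-variable chain model
    (X1 - X2 - X3) under epsilon-differential privacy via the Laplace mechanism.

    Each record contributes 1 to each of the 2 pairwise tables, so the L1
    sensitivity of the released vector of counts is 2.
    """
    pairs = [(0, 1), (1, 2)]                       # edges of the assumed chain model
    sensitivity = len(pairs)
    tables = {}
    for i, j in pairs:
        counts = np.zeros((num_values, num_values))
        for r in records:
            counts[r[i], r[j]] += 1
        noise = np.random.laplace(scale=sensitivity / epsilon, size=counts.shape)
        tables[(i, j)] = counts + noise
    return tables

rng = np.random.default_rng(7)
data = rng.integers(0, 3, size=(1000, 3))           # 1000 individuals, 3 ternary variables (synthetic)
private_tables = noisy_sufficient_statistics(data, num_values=3, epsilon=1.0)
print(private_tables[(0, 1)].round(1))
```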
Haopeng Zhang, Yanlei Diao, Alexandra Meliou: EXstream: Explaining Anomalies in Event Stream Monitoring. Inproceedings, Proceedings of the 20th International Conference on Extending Database Technology, EDBT 2017, Venice, Italy, March 21-24, 2017, pp. 156–167. https://people.cs.umass.edu/~ameli/papers/EDBT2017.pdf, doi: 10.5441/002/edbt.2017.15
In this paper, we present the EXstream system that provides high-quality explanations for anomalous behaviors that users annotate on CEP-based monitoring results. Given the new requirements for explanations, namely, conciseness, consistency with human interpretation, and prediction power, most existing techniques cannot produce explanations that satisfy all three of them. The key technical contributions of this work include a formal definition of optimally explaining anomalies in CEP monitoring, and three key techniques for generating sufficient feature space, characterizing the contribution of each feature to the explanation, and selecting a small subset of features as the optimal explanation, respectively. Evaluation using two real-world use cases shows that EXstream can outperform existing techniques significantly in conciseness and consistency while achieving comparable high prediction power and retaining a highly efficient implementation of a data stream system.
Xiaolan Wang, Alexandra Meliou, Eugene Wu: QFix: Diagnosing errors through query histories. Inproceedings, Proceedings of the ACM SIGMOD International Conference on Management of Data (SIGMOD), pp. 1369–1384, 2017. http://people.cs.umass.edu/ameli/projects/queryProvenance/papers/WangMW2017.pdf, doi: 10.1145/2882903.2899388
Data-driven applications rely on the correctness of their data to function properly and effectively. Errors in data can be incredibly costly and disruptive, leading to loss of revenue, incorrect conclusions, and misguided policy decisions. While data cleaning tools can purge datasets of many errors before the data is used, applications and users interacting with the data can introduce new errors. Subsequent valid updates can obscure these errors and propagate them through the dataset causing more discrepancies. Even when some of these discrepancies are discovered, they are often corrected superficially, on a case-by-case basis, further obscuring the true underlying cause, and making detection of the remaining errors harder. In this paper, we propose QFix, a framework that derives explanations and repairs for discrepancies in relational data, by analyzing the effect of queries that operated on the data and identifying potential mistakes in those queries. QFix is flexible, handling scenarios where only a subset of the true discrepancies is known, and robust to different types of update workloads. We make four important contributions: (a) we formalize the problem of diagnosing the causes of data errors based on the queries that operated on and introduced errors to a dataset; (b) we develop exact methods for deriving diagnoses and fixes for identified errors using state-of-the-art tools; (c) we present several optimization techniques that improve our basic approach without compromising accuracy, and (d) we leverage a tradeoff between accuracy and performance to scale diagnosis to large datasets and query logs, while achieving near-optimal results. We demonstrate the effectiveness of QFix through extensive evaluation over benchmark and synthetic data.
Matteo Brucato, Azza Abouzied, Alexandra Meliou: A Scalable Execution Engine for Package Queries. Journal Article, SIGMOD Record, 46(1), pp. 24–31, 2017. ISSN: 0163-5808. https://sigmodrecord.org/publications/sigmodRecord/1703/pdfs/08_ASalable_RH_Brucato.pdf, doi: 10.1145/3093754.3093761 (ACM SIGMOD Research Highlight Award)
Many modern applications and real-world problems involve the design of item collections, or packages: from planning your daily meals all the way to mapping the universe. Despite the pervasive need for packages, traditional data management does not offer support for their definition and computation. This is because traditional database queries follow a powerful, but very simple model: a query defines constraints that each tuple in the result must satisfy. However, a system tasked with the design of packages cannot consider items independently; rather, the system needs to determine if a set of items collectively satisfy given criteria. In this paper, we present package queries, a new query model that extends traditional database queries to handle complex constraints and preferences over answer sets. We develop a full-fledged package query system, implemented on top of a traditional database engine. Our work makes several contributions. First, we design PaQL, a SQL-based query language that supports the declarative specification of package queries. Second, we present a fundamental strategy for evaluating package queries that combines the capabilities of databases and constraint optimization solvers. The core of our approach is a set of translation rules that transform a package query to an integer linear program. Third, we introduce an offline data partitioning strategy allowing query evaluation to scale to large data sizes. Fourth, we introduce SKETCHREFINE, an efficient and scalable algorithm for package evaluation, which offers strong approximation guarantees. Finally, we present extensive experiments over real-world data. Our results demonstrate that SKETCHREFINE is effective at deriving high-quality package results, and achieves runtime performance that is an order of magnitude faster than directly using ILP solvers over large datasets.
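The core translation described here, from a package query to an integer linear program, can be sketched for a toy meal-planning query. The example below uses the PuLP solver with made-up data and column names; it follows the general one-binary-variable-per-tuple translation idea rather than reproducing PaQL syntax or the SketchRefine algorithm.

```python
# Toy translation of a package query ("pick meals with total calories <= 2000,
# at least 50g protein, maximizing total rating") into an ILP, using PuLP.
import pulp

meals = [  # (name, calories, protein, rating) -- made-up data for illustration
    ("oatmeal", 300, 10, 4.1), ("chicken salad", 450, 35, 4.5),
    ("burger", 800, 30, 3.9), ("lentil soup", 350, 18, 4.2),
    ("salmon bowl", 600, 40, 4.8), ("fruit plate", 200, 3, 3.5),
]

prob = pulp.LpProblem("package_query", pulp.LpMaximize)
x = [pulp.LpVariable(f"x_{i}", cat="Binary") for i in range(len(meals))]   # is tuple i in the package?

prob += pulp.lpSum(x[i] * meals[i][3] for i in range(len(meals)))           # MAXIMIZE SUM(rating)
prob += pulp.lpSum(x[i] * meals[i][1] for i in range(len(meals))) <= 2000   # SUCH THAT SUM(calories) <= 2000
prob += pulp.lpSum(x[i] * meals[i][2] for i in range(len(meals))) >= 50     # AND SUM(protein) >= 50

prob.solve(pulp.PULP_CBC_CMD(msg=False))
print([meals[i][0] for i in range(len(meals)) if x[i].value() == 1])
```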
Sainyam Galhotra, Yuriy Brun, Alexandra Meliou: Fairness Testing: Testing Software for Discrimination. Inproceedings, Proceedings of the 2017 11th Joint Meeting of the European Software Engineering Conference and the ACM SIGSOFT Symposium on the Foundations of Software Engineering (ESEC/FSE), pp. 498–510, 2017. http://people.cs.umass.edu/~brun/pubs/pubs/Galhotra17fse.pdf, doi: 10.1145/3106237.3106277 (ACM SIGSOFT Distinguished Paper Award)
This paper defines the notions of software fairness and discrimination and develops a testing-based method for measuring if and how much software discriminates. Specifically, the paper focuses on measuring causality in discriminatory behavior. Modern software contributes to important societal decisions and evidence of software discrimination has been found in systems that recommend criminal sentences, grant access to financial loans and products, and determine who is allowed to participate in promotions and receive services. Our approach, Themis, measures discrimination in software by generating efficient, discrimination-testing test suites. Given a schema describing valid system inputs, Themis generates discrimination tests automatically and, notably, does not require an oracle. We evaluate Themis on 20 software systems, 12 of which come from prior work with explicit focus on avoiding discrimination. We find that (1) Themis is effective at discovering software discrimination, (2) state-of-the-art techniques for removing discrimination from algorithms fail in many situations, at times discriminating against as much as 98% of an input subdomain, (3) Themis optimizations are effective at producing efficient test suites for measuring discrimination, and (4) Themis is more efficient on systems that exhibit more discrimination. We thus demonstrate that fairness testing is a critical aspect of the software development cycle in domains with possible discrimination and provide initial tools for measuring software discrimination.
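The causal notion of discrimination that Themis measures can be illustrated directly: hold every non-sensitive input fixed, flip the sensitive attribute, and count how often the output changes. The sketch below does this for a deliberately biased toy decision function; it is not the Themis tool and omits its test-suite pruning and sampling optimizations.

```python
import random

def loan_decision(income, credit_score, gender):
    """A deliberately biased toy decision function standing in for the software under test."""
    threshold = 600 if gender == "male" else 650
    return income > 40_000 and credit_score > threshold

def causal_discrimination(software, trials=10_000):
    """Fraction of randomly generated inputs whose output flips when only
    the sensitive attribute (here, gender) is changed."""
    flips = 0
    for _ in range(trials):
        income = random.randint(10_000, 120_000)
        credit_score = random.randint(300, 850)
        if software(income, credit_score, "male") != software(income, credit_score, "female"):
            flips += 1
    return flips / trials

print(f"causal discrimination w.r.t. gender: {causal_discrimination(loan_decision):.3f}")
```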
Michael Hay, Liudmila Elagina, Gerome Miklau Differentially Private Rank Aggregation Inproceedings Proceedings of the 2017 SIAM International Conference on Data Mining, Houston, Texas, USA, April 27-29, 2017., pp. 669–677, 2017. @inproceedings{DBLP:conf/sdm/HayEM17, title = {Differentially Private Rank Aggregation}, author = {Michael Hay and Liudmila Elagina and Gerome Miklau}, url = {https://doi.org/10.1137/1.9781611974973.75}, doi = {10.1137/1.9781611974973.75}, year = {2017}, date = {2017-01-01}, booktitle = {Proceedings of the 2017 SIAM International Conference on Data Mining, Houston, Texas, USA, April 27-29, 2017.}, pages = {669--677}, crossref = {DBLP:conf/sdm/2017}, abstract = {Given a collection of rankings of a set of items, rank aggregation seeks to compute a ranking that can serve as a single best representative of the collection. Rank aggregation is a well-studied problem and a number of effective algorithmic solutions have been proposed in the literature. However, when individuals are asked to contribute a ranking, they may be concerned that their personal preferences will be disclosed inappropriately to others. This acts as a disincentive to individuals to respond honestly in expressing their preferences and impedes data collection and data sharing. We address this problem by investigating rank aggregation under differential privacy, which requires that a released output (here, the aggregate ranking computed from individuals' rankings) remain almost the same if any one individual's ranking is removed from the input. We propose a number of differentially-private rank aggregation algorithms: two are inspired by non-private approximate rank aggregators from the existing literature; another uses a novel rejection sampling method to sample privately from a complex distribution. For all the methods we propose, we quantify, both theoretically and empirically, the “cost” of privacy in terms of the quality of the rank aggregation computed.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Given a collection of rankings of a set of items, rank aggregation seeks to compute a ranking that can serve as a single best representative of the collection. Rank aggregation is a well-studied problem and a number of effective algorithmic solutions have been proposed in the literature. However, when individuals are asked to contribute a ranking, they may be concerned that their personal preferences will be disclosed inappropriately to others. This acts as a disincentive to individuals to respond honestly in expressing their preferences and impedes data collection and data sharing. We address this problem by investigating rank aggregation under differential privacy, which requires that a released output (here, the aggregate ranking computed from individuals' rankings) remain almost the same if any one individual's ranking is removed from the input. We propose a number of differentially-private rank aggregation algorithms: two are inspired by non-private approximate rank aggregators from the existing literature; another uses a novel rejection sampling method to sample privately from a complex distribution. For all the methods we propose, we quantify, both theoretically and empirically, the “cost” of privacy in terms of the quality of the rank aggregation computed. |
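For context, a minimal sketch of the generic recipe behind differentially private rank aggregation: compute a non-private aggregate score (here Borda counts) and release a ranking from Laplace-noised scores. This is not necessarily one of the paper's algorithms; the sensitivity bound assumes neighboring inputs differ by removing one voter's complete ranking of m items, which changes the Borda score vector by at most m(m-1)/2 in L1 norm.

# Illustrative Borda-plus-Laplace rank aggregation (not the paper's exact mechanism).
import numpy as np

def private_borda_ranking(rankings, m, epsilon, rng):
    # rankings: list of permutations of range(m), best item first.
    scores = np.zeros(m)
    for r in rankings:
        for position, item in enumerate(r):
            scores[item] += m - 1 - position   # Borda points
    sensitivity = m * (m - 1) / 2              # one ranking contributes 0..m-1 points
    noisy = scores + rng.laplace(scale=sensitivity / epsilon, size=m)
    return list(np.argsort(-noisy))            # items ordered by noisy score

if __name__ == "__main__":
    rng = np.random.default_rng(7)
    voters = [[0, 1, 2, 3], [0, 2, 1, 3], [1, 0, 2, 3]]
    print(private_borda_ranking(voters, m=4, epsilon=1.0, rng=rng))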
Ios Kotsogiannis, Ashwin Machanavajjhala, Michael Hay, Gerome Miklau Pythia: Data Dependent Differentially Private Algorithm Selection Inproceedings Proceedings of the 2017 ACM International Conference on Management of Data, SIGMOD Conference 2017, Chicago, IL, USA, May 14-19, 2017, pp. 1323–1337, 2017. @inproceedings{DBLP:conf/sigmod/KotsogiannisMHM17, title = {Pythia: Data Dependent Differentially Private Algorithm Selection}, author = {Ios Kotsogiannis and Ashwin Machanavajjhala and Michael Hay and Gerome Miklau}, url = {http://doi.acm.org/10.1145/3035918.3035945}, doi = {10.1145/3035918.3035945}, year = {2017}, date = {2017-01-01}, booktitle = {Proceedings of the 2017 ACM International Conference on Management of Data, SIGMOD Conference 2017, Chicago, IL, USA, May 14-19, 2017}, pages = {1323--1337}, crossref = {DBLP:conf/sigmod/2017}, abstract = {Differential privacy has emerged as a preferred standard for ensuring privacy in analysis tasks on sensitive datasets. Recent algorithms have allowed for significantly lower error by adapting to properties of the input data. These so-called data-dependent algorithms have different error rates for different inputs. There is now a complex and growing landscape of algorithms without a clear winner that can offer low error over all datasets. As a result, the best possible error rates are not attainable in practice, because the data curator cannot know which algorithm to select prior to actually running the algorithm. We address this challenge by proposing a novel meta-algorithm designed to relieve the data curator of the burden of algorithm selection. It works by learning (from non-sensitive data) the association between dataset properties and the best-performing algorithm. The meta-algorithm is deployed by first testing the input for low-sensitivity properties and then using the results to select a good algorithm. The result is an end-to-end differentially private system: Pythia, which we show offers improvements over using any single algorithm alone. We empirically demonstrate the benefit of Pythia for the tasks of releasing histograms, answering 1- and 2-dimensional range queries, as well as for constructing private Naive Bayes classifiers.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Differential privacy has emerged as a preferred standard for ensuring privacy in analysis tasks on sensitive datasets. Recent algorithms have allowed for significantly lower error by adapting to properties of the input data. These so-called data-dependent algorithms have different error rates for different inputs. There is now a complex and growing landscape of algorithms without a clear winner that can offer low error over all datasets. As a result, the best possible error rates are not attainable in practice, because the data curator cannot know which algorithm to select prior to actually running the algorithm. We address this challenge by proposing a novel meta-algorithm designed to relieve the data curator of the burden of algorithm selection. It works by learning (from non-sensitive data) the association between dataset properties and the best-performing algorithm. The meta-algorithm is deployed by first testing the input for low-sensitivity properties and then using the results to select a good algorithm. The result is an end-to-end differentially private system: Pythia, which we show offers improvements over using any single algorithm alone. 
We empirically demonstrate the benefit of Pythia for the tasks of releasing histograms, answering 1- and 2-dimensional range queries, as well as for constructing private Naive Bayes classifiers. |
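The shape of Pythia's deployment can be sketched as: spend a small share of the privacy budget on a low-sensitivity feature of the input, pick an algorithm with a selector (learned from non-sensitive data in Pythia, hardcoded here), then run the chosen mechanism with the remaining budget. The two candidate mechanisms, the feature, and the threshold below are invented for illustration.

# Toy data-dependent algorithm selection under a split privacy budget (illustrative only).
import numpy as np

def laplace_identity(hist, eps, rng):
    # Per-bin Laplace noise; L1 sensitivity of a count histogram is 1.
    return hist + rng.laplace(scale=1.0 / eps, size=hist.shape)

def uniform_estimate(hist, eps, rng):
    # Noisy total spread evenly across bins; total count also has sensitivity 1.
    total = hist.sum() + rng.laplace(scale=1.0 / eps)
    return np.full(hist.shape, total / hist.size)

def select_and_run(hist, eps, rng, feature_frac=0.1):
    eps_feat, eps_run = feature_frac * eps, (1 - feature_frac) * eps
    # Low-sensitivity feature: noisy dataset scale.
    noisy_total = hist.sum() + rng.laplace(scale=1.0 / eps_feat)
    # Hardcoded stand-in for Pythia's learned selector.
    if noisy_total > 10 * hist.size:
        algo = laplace_identity      # dense data: per-bin noise is relatively small
    else:
        algo = uniform_estimate      # sparse data: a flat estimate can win
    return algo.__name__, algo(hist, eps_run, rng)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    sparse = np.array([0, 1, 0, 2, 0, 0, 1, 0], dtype=float)
    dense = np.array([120, 95, 130, 110, 80, 150, 90, 105], dtype=float)
    for h in (sparse, dense):
        name, est = select_and_run(h, eps=1.0, rng=rng)
        print(name, np.round(est, 1))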
Ios Kotsogiannis, Michael Hay, Ashwin Machanavajjhala, Gerome Miklau, Margaret Orr DIAS: Differentially Private Interactive Algorithm Selection using Pythia Inproceedings Proceedings of the 2017 ACM International Conference on Management of Data, SIGMOD Conference 2017, Chicago, IL, USA, May 14-19, 2017, pp. 1679–1682, 2017. @inproceedings{DBLP:conf/sigmod/KotsogiannisHMM17, title = {DIAS: Differentially Private Interactive Algorithm Selection using Pythia}, author = {Ios Kotsogiannis and Michael Hay and Ashwin Machanavajjhala and Gerome Miklau and Margaret Orr}, url = {http://doi.acm.org/10.1145/3035918.3056441}, doi = {10.1145/3035918.3056441}, year = {2017}, date = {2017-01-01}, booktitle = {Proceedings of the 2017 ACM International Conference on Management of Data, SIGMOD Conference 2017, Chicago, IL, USA, May 14-19, 2017}, pages = {1679--1682}, crossref = {DBLP:conf/sigmod/2017}, abstract = {Differential privacy has emerged as the dominant privacy standard for data analysis. Its wide acceptance has led to significant development of algorithms that meet this rigorous standard. For some tasks, such as the task of answering low dimensional counting queries, dozens of algorithms have been proposed. However, no single algorithm has emerged as the dominant performer, and in fact, algorithm performance varies drastically across inputs. Thus, it's not clear how to select an algorithm for a particular task, and choosing the wrong algorithm might lead to significant degradation in terms of analysis accuracy. We believe that the difficulty of algorithm selection is one factor limiting the adoption of differential privacy in real systems. In this demonstration we present DIAS (Differentially-private Interactive Algorithm Selection), an educational privacy game. Users are asked to perform algorithm selection for a variety of inputs and compare the performance of their choices against that of Pythia, an automated algorithm selection framework. Our hope is that by the end of the game users will understand the importance of algorithm selection and most importantly will have a good grasp on how to use differentially private algorithms for their own applications.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Differential privacy has emerged as the dominant privacy standard for data analysis. Its wide acceptance has led to significant development of algorithms that meet this rigorous standard. For some tasks, such as the task of answering low dimensional counting queries, dozens of algorithms have been proposed. However, no single algorithm has emerged as the dominant performer, and in fact, algorithm performance varies drastically across inputs. Thus, it's not clear how to select an algorithm for a particular task, and choosing the wrong algorithm might lead to significant degradation in terms of analysis accuracy. We believe that the difficulty of algorithm selection is one factor limiting the adoption of differential privacy in real systems. In this demonstration we present DIAS (Differentially-private Interactive Algorithm Selection), an educational privacy game. Users are asked to perform algorithm selection for a variety of inputs and compare the performance of their choices against that of Pythia, an automated algorithm selection framework. Our hope is that by the end of the game users will understand the importance of algorithm selection and most importantly will have a good grasp on how to use differentially private algorithms for their own applications. |
Garrett Bernstein, Ryan McKenna, Tao Sun, Daniel Sheldon, Michael Hay, Gerome Miklau Differentially Private Learning of Undirected Graphical Models using Collective Graphical Models Journal Article CoRR, abs/1706.04646 , 2017. @article{DBLP:journals/corr/BernsteinMSSHM17, title = {Differentially Private Learning of Undirected Graphical Models using Collective Graphical Models}, author = {Garrett Bernstein and Ryan McKenna and Tao Sun and Daniel Sheldon and Michael Hay and Gerome Miklau}, url = {http://arxiv.org/abs/1706.04646}, year = {2017}, date = {2017-01-01}, journal = {CoRR}, volume = {abs/1706.04646}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Julia Stoyanovich, Bill Howe, Serge Abiteboul, Gerome Miklau, Arnaud Sahuguet, Gerhard Weikum Fides: Towards a Platform for Responsible Data Science Inproceedings Proceedings of the 29th International Conference on Scientific and Statistical Database Management, Chicago, IL, USA, June 27-29, 2017, pp. 26:1–26:6, 2017. @inproceedings{DBLP:conf/ssdbm/StoyanovichHAMS17, title = {Fides: Towards a Platform for Responsible Data Science}, author = {Julia Stoyanovich and Bill Howe and Serge Abiteboul and Gerome Miklau and Arnaud Sahuguet and Gerhard Weikum}, url = {http://doi.acm.org/10.1145/3085504.3085530}, doi = {10.1145/3085504.3085530}, year = {2017}, date = {2017-01-01}, booktitle = {Proceedings of the 29th International Conference on Scientific and Statistical Database Management, Chicago, IL, USA, June 27-29, 2017}, pages = {26:1--26:6}, crossref = {DBLP:conf/ssdbm/2017}, abstract = {Issues of responsible data analysis and use are coming to the forefront of the discourse in data science research and practice, with most significant efforts to date on the part of the data mining, machine learning, and security and privacy communities. In these fields, the research has been focused on analyzing the fairness, accountability and transparency (FAT) properties of specific algorithms and their outputs. Although these issues are most apparent in the social sciences where fairness is interpreted in terms of the distribution of resources across protected groups, management of bias in source data affects a variety of fields. Consider climate change studies that require representative data from geographically diverse regions, or supply chain analyses that require data that represents the diversity of products and customers. Any domain that involves sparse or sampled data has exposure to potential bias. In this vision paper, we argue that FAT properties must be considered as database system issues, further upstream in the data science lifecycle: bias in source data goes unnoticed, and bias may be introduced during pre-processing (fairness), spurious correlations lead to reproducibility problems (accountability), and assumptions made during pre-processing have invisible but significant effects on decisions (transparency). As machine learning methods continue to be applied broadly by non-experts, the potential for misuse increases. We see a need for a data sharing and collaborative analytics platform with features to encourage (and in some cases, enforce) best practices at all stages of the data science lifecycle. We describe features of such a platform, which we term Fides, in the context of urban analytics, outlining a systems research agenda in responsible data science.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Issues of responsible data analysis and use are coming to the forefront of the discourse in data science research and practice, with most significant efforts to date on the part of the data mining, machine learning, and security and privacy communities. In these fields, the research has been focused on analyzing the fairness, accountability and transparency (FAT) properties of specific algorithms and their outputs. Although these issues are most apparent in the social sciences where fairness is interpreted in terms of the distribution of resources across protected groups, management of bias in source data affects a variety of fields. 
Consider climate change studies that require representative data from geographically diverse regions, or supply chain analyses that require data that represents the diversity of products and customers. Any domain that involves sparse or sampled data has exposure to potential bias. In this vision paper, we argue that FAT properties must be considered as database system issues, further upstream in the data science lifecycle: bias in source data goes unnoticed, and bias may be introduced during pre-processing (fairness), spurious correlations lead to reproducibility problems (accountability), and assumptions made during pre-processing have invisible but significant effects on decisions (transparency). As machine learning methods continue to be applied broadly by non-experts, the potential for misuse increases. We see a need for a data sharing and collaborative analytics platform with features to encourage (and in some cases, enforce) best practices at all stages of the data science lifecycle. We describe features of such a platform, which we term Fides, in the context of urban analytics, outlining a systems research agenda in responsible data science. |
Abhishek Roy, Yanlei Diao, Uday Evani, Avinash Abhyankar, Clinton Howarth, Rémi Le Priol, Toby Bloom Massively Parallel Processing of Whole Genome Sequence Data: An In-Depth Performance Study Inproceedings Proceedings of the 2017 ACM International Conference on Management of Data, SIGMOD Conference 2017, Chicago, IL, USA, May 14-19, 2017, pp. 187–202, 2017. @inproceedings{DBLP:conf/sigmod/RoyDEAHPB17, title = {Massively Parallel Processing of Whole Genome Sequence Data: An In-Depth Performance Study}, author = {Abhishek Roy and Yanlei Diao and Uday Evani and Avinash Abhyankar and Clinton Howarth and R{é}mi Le Priol and Toby Bloom}, url = {http://doi.acm.org/10.1145/3035918.3064048}, doi = {10.1145/3035918.3064048}, year = {2017}, date = {2017-01-01}, booktitle = {Proceedings of the 2017 ACM International Conference on Management of Data, SIGMOD Conference 2017, Chicago, IL, USA, May 14-19, 2017}, pages = {187--202}, crossref = {DBLP:conf/sigmod/2017}, abstract = {This paper presents a joint effort between a group of computer scientists and bioinformaticians to take an important step towards a general big data platform for genome analysis pipelines. The key goals of this study are to develop a thorough understanding of the strengths and limitations of big data technology for genomic data analysis, and to identify the key questions that the research community could address to realize the vision of personalized genomic medicine. Our platform, called Gesall, is based on the new "Wrapper Technology" that supports existing genomic data analysis programs in their native forms, without having to rewrite them. To do so, our system provides several layers of software, including a new Genome Data Parallel Toolkit (GDPT), which can be used to "wrap" existing data analysis programs. This platform offers a concrete context for evaluating big data technology for genomics: we report on super-linear speedup and sublinear speedup for various tasks, as well as the reasons why a parallel program could produce different results from those of a serial program. These results lead to key research questions that require a synergy between genomics scientists and computer scientists to find solutions.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } This paper presents a joint effort between a group of computer scientists and bioinformaticians to take an important step towards a general big data platform for genome analysis pipelines. The key goals of this study are to develop a thorough understanding of the strengths and limitations of big data technology for genomic data analysis, and to identify the key questions that the research community could address to realize the vision of personalized genomic medicine. Our platform, called Gesall, is based on the new "Wrapper Technology" that supports existing genomic data analysis programs in their native forms, without having to rewrite them. To do so, our system provides several layers of software, including a new Genome Data Parallel Toolkit (GDPT), which can be used to "wrap" existing data analysis programs. This platform offers a concrete context for evaluating big data technology for genomics: we report on super-linear speedup and sublinear speedup for various tasks, as well as the reasons why a parallel program could produce different results from those of a serial program. These results lead to key research questions that require a synergy between genomics scientists and computer scientists to find solutions. |
Ahmed Elgohary, Matthias Boehm, Peter J Haas, Frederick R Reiss, Berthold Reinwald Compressed linear algebra for large-scale machine learning Journal Article The VLDB Journal, 2017, ISSN: 0949-877X. @article{Elgohary2017, title = {Compressed linear algebra for large-scale machine learning}, author = {Ahmed Elgohary and Matthias Boehm and Peter J Haas and Frederick R Reiss and Berthold Reinwald}, url = {http://www.vldb.org/pvldb/vol9/p960-elgohary.pdf https://doi.org/10.1007/s00778-017-0478-1}, doi = {10.1007/s00778-017-0478-1}, issn = {0949-877X}, year = {2017}, date = {2017-09-12}, journal = {The VLDB Journal}, abstract = {Large-scale machine learning algorithms are often iterative, using repeated read-only data access and I/O-bound matrix-vector multiplications to converge to an optimal model. It is crucial for performance to fit the data into single-node or distributed main memory and enable fast matrix-vector operations on in-memory data. General-purpose, heavy- and lightweight compression techniques struggle to achieve both good compression ratios and fast decompression speed to enable block-wise uncompressed operations. Therefore, we initiate work---inspired by database compression and sparse matrix formats---on value-based compressed linear algebra (CLA), in which heterogeneous, lightweight database compression techniques are applied to matrices, and then linear algebra operations such as matrix-vector multiplication are executed directly on the compressed representation. We contribute effective column compression schemes, cache-conscious operations, and an efficient sampling-based compression algorithm. Our experiments show that CLA achieves in-memory operations performance close to the uncompressed case and good compression ratios, which enables fitting substantially larger datasets into available memory. We thereby obtain significant end-to-end performance improvements up to 9.2x.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Large-scale machine learning algorithms are often iterative, using repeated read-only data access and I/O-bound matrix-vector multiplications to converge to an optimal model. It is crucial for performance to fit the data into single-node or distributed main memory and enable fast matrix-vector operations on in-memory data. General-purpose, heavy- and lightweight compression techniques struggle to achieve both good compression ratios and fast decompression speed to enable block-wise uncompressed operations. Therefore, we initiate work---inspired by database compression and sparse matrix formats---on value-based compressed linear algebra (CLA), in which heterogeneous, lightweight database compression techniques are applied to matrices, and then linear algebra operations such as matrix-vector multiplication are executed directly on the compressed representation. We contribute effective column compression schemes, cache-conscious operations, and an efficient sampling-based compression algorithm. Our experiments show that CLA achieves in-memory operations performance close to the uncompressed case and good compression ratios, which enables fitting substantially larger datasets into available memory. We thereby obtain significant end-to-end performance improvements up to 9.2x. |
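A minimal sketch of the idea of executing linear algebra directly on compressed columns: each column is encoded as a map from distinct value to the list of row offsets where it occurs, and a matrix-vector product touches each (value, column) pair once. This is only an offset-list-style illustration; the actual CLA work adds run-length encoding, column co-coding, cache-conscious kernels, and sampling-based compression planning.

# Value-based column compression and a matrix-vector product on the compressed form.
import numpy as np

def compress_columns(M):
    # For each column, map each distinct nonzero value to its row offsets.
    cols = []
    for j in range(M.shape[1]):
        groups = {}
        for i, v in enumerate(M[:, j]):
            if v != 0:
                groups.setdefault(float(v), []).append(i)
        cols.append(groups)
    return cols, M.shape[0]

def matvec_compressed(cols, nrows, x):
    y = np.zeros(nrows)
    for j, groups in enumerate(cols):
        for value, offsets in groups.items():
            # Every occurrence of `value` in column j contributes value * x[j]
            # to the corresponding rows: one multiply per (value, column) pair.
            y[offsets] += value * x[j]
    return y

if __name__ == "__main__":
    M = np.array([[1.0, 0.0, 2.0],
                  [1.0, 3.0, 0.0],
                  [0.0, 3.0, 2.0],
                  [1.0, 0.0, 2.0]])
    x = np.array([0.5, 1.0, 2.0])
    cols, n = compress_columns(M)
    print(matvec_compressed(cols, n, x), M @ x)   # both give the same result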

Yan Chen, Ashwin Machanavajjhala, Michael Hay, Gerome Miklau PeGaSus: Data-Adaptive Differentially Private Stream Processing Inproceedings Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security, CCS 2017, Dallas, TX, USA, October 30 - November 03, 2017, pp. 1375–1388, 2017. @inproceedings{DBLP:conf/ccs/ChenMHM17, title = {PeGaSus: Data-Adaptive Differentially Private Stream Processing}, author = {Yan Chen and Ashwin Machanavajjhala and Michael Hay and Gerome Miklau}, url = {https://doi.org/10.1145/3133956.3134102}, doi = {10.1145/3133956.3134102}, year = {2017}, date = {2017-01-01}, booktitle = {Proceedings of the 2017 ACM SIGSAC Conference on Computer and Communications Security, CCS 2017, Dallas, TX, USA, October 30 - November 03, 2017}, pages = {1375--1388}, crossref = {DBLP:conf/ccs/2017}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } |
Chao Li, Daniel Yang Li, Gerome Miklau, Dan Suciu A theory of pricing private data Journal Article Commun. ACM, 60 (12), pp. 79–86, 2017. @article{DBLP:journals/cacm/LiLMS17, title = {A theory of pricing private data}, author = {Chao Li and Daniel Yang Li and Gerome Miklau and Dan Suciu}, url = {https://doi.org/10.1145/3139457}, doi = {10.1145/3139457}, year = {2017}, date = {2017-01-01}, journal = {Commun. ACM}, volume = {60}, number = {12}, pages = {79--86}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Matteo Brucato, Azza Abouzied, Chris Blauvelt Redistributing Funds across Charitable Crowdfunding Campaigns Journal Article CoRR, abs/1706.00070 , 2017. @article{DBLP:journals/corr/BrucatoAB17, title = {Redistributing Funds across Charitable Crowdfunding Campaigns}, author = {Matteo Brucato and Azza Abouzied and Chris Blauvelt}, url = {http://arxiv.org/abs/1706.00070}, year = {2017}, date = {2017-01-01}, journal = {CoRR}, volume = {abs/1706.00070}, abstract = {On Kickstarter only 36% of crowdfunding campaigns successfully raise sufficient funds for their projects. In this paper, we explore the possibility of redistribution of crowdfunding donations to increase the chances of success. We define several intuitive redistribution policies and, using data from a real crowdfunding platform, LaunchGood, we assess the potential improvement in campaign fundraising success rates. We find that an aggressive redistribution scheme can boost campaign success rates from 37% to 79%, but such choice-agnostic redistribution schemes come at the cost of disregarding donor preferences. Taking inspiration from offline giving societies and donor clubs, we build a case for choice-preserving redistribution schemes that strike a balance between increasing the number of successful campaigns and respecting giving preference. We find that choice-preserving redistribution can easily achieve campaign success rates of 48%. Finally, we discuss the implications of these different redistribution schemes for the various stakeholders in the crowdfunding ecosystem.}, keywords = {}, pubstate = {published}, tppubtype = {article} } On Kickstarter only 36% of crowdfunding campaigns successfully raise sufficient funds for their projects. In this paper, we explore the possibility of redistribution of crowdfunding donations to increase the chances of success. We define several intuitive redistribution policies and, using data from a real crowdfunding platform, LaunchGood, we assess the potential improvement in campaign fundraising success rates. We find that an aggressive redistribution scheme can boost campaign success rates from 37% to 79%, but such choice-agnostic redistribution schemes come at the cost of disregarding donor preferences. Taking inspiration from offline giving societies and donor clubs, we build a case for choice-preserving redistribution schemes that strike a balance between increasing the number of successful campaigns and respecting giving preference. We find that choice-preserving redistribution can easily achieve campaign success rates of 48%. Finally, we discuss the implications of these different redistribution schemes for the various stakeholders in the crowdfunding ecosystem. |
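Purely to illustrate why choice-agnostic redistribution can raise success rates while disregarding donor preference, here is a toy greedy policy that pools every donation and funds the campaigns with the smallest goals first. The paper defines and evaluates its own redistribution policies, including choice-preserving ones; this sketch is not one of them.

# Toy choice-agnostic redistribution policy (illustrative only).
def greedy_redistribution(campaigns):
    # campaigns: list of dicts with 'goal' and 'raised'.
    pool = sum(c["raised"] for c in campaigns)
    funded = []
    for c in sorted(campaigns, key=lambda c: c["goal"]):
        if c["goal"] <= pool:
            pool -= c["goal"]
            funded.append(c)
    return funded

if __name__ == "__main__":
    campaigns = [
        {"name": "water well", "goal": 500, "raised": 450},
        {"name": "school kits", "goal": 300, "raised": 100},
        {"name": "clinic", "goal": 2000, "raised": 600},
    ]
    winners = greedy_redistribution(campaigns)
    print([c["name"] for c in winners])   # without redistribution, none would succeed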
2016 |
Yanlei Diao, Michael J. Franklin High-Performance XML Message Brokering Incollection Data Stream Management - Processing High-Speed Data Streams, pp. 451–471, 2016. @incollection{DBLP:books/sp/16/DiaoF16, title = {High-Performance XML Message Brokering}, author = {Yanlei Diao and Michael J. Franklin}, url = {http://dx.doi.org/10.1007/978-3-540-28608-0_22}, doi = {10.1007/978-3-540-28608-0_22}, year = {2016}, date = {2016-01-01}, booktitle = {Data Stream Management - Processing High-Speed Data Streams}, pages = {451--471}, crossref = {DBLP:books/sp/GGR2016}, abstract = {For distributed environments including Web Services, data and application integration, and personalized content delivery, XML is becoming the common wire format for data. In this emerging distributed infrastructure, XML message brokers will play a key role as central exchange points for messages sent between applications and/or users. Users (equivalently, applications, or organizations) subscribe to the message broker by providing profiles expressing their data interests. After arriving at the message broker, these profiles become “standing queries,” which are executed on all incoming data. Data sources publish their data by pushing streams of XML messages to the broker. The broker delivers to each user the messages that match his data interests; these messages are presented in the required format of the user. We have developed “YFilter”, an XML filtering system aimed at providing efficient filtering for large numbers (e.g., 10’s or 100’s of thousands) of path queries. The key innovation in YFilter is a Nondeterministic Finite Automaton (NFA)-based representation of path expressions which combines all queries into a single machine. YFilter exploits commonality among path queries by merging the common prefixes of the paths so that they are processed at most once. The NFA-based implementation also provides additional benefits including a relatively small machine size, flexibility in dealing with diverse characteristics of data and queries, incremental machine construction, and ease of maintenance. This work has been supported in part by the National Science Foundation under the ITR grants IIS0086057 and SI0122599 and by Boeing, IBM, Intel, Microsoft, Siemens, and the UC MICRO program.}, keywords = {}, pubstate = {published}, tppubtype = {incollection} } For distributed environments including Web Services, data and application integration, and personalized content delivery, XML is becoming the common wire format for data. In this emerging distributed infrastructure, XML message brokers will play a key role as central exchange points for messages sent between applications and/or users. Users (equivalently, applications, or organizations) subscribe to the message broker by providing profiles expressing their data interests. After arriving at the message broker, these profiles become “standing queries,” which are executed on all incoming data. Data sources publish their data by pushing streams of XML messages to the broker. The broker delivers to each user the messages that match his data interests; these messages are presented in the required format of the user. We have developed “YFilter”, an XML filtering system aimed at providing efficient filtering for large numbers (e.g., 10’s or 100’s of thousands) of path queries. The key innovation in YFilter is a Nondeterministic Finite Automaton (NFA)-based representation of path expressions which combines all queries into a single machine. 
YFilter exploits commonality among path queries by merging the common prefixes of the paths so that they are processed at most once. The NFA-based implementation also provides additional benefits including a relatively small machine size, flexibility in dealing with diverse characteristics of data and queries, incremental machine construction, and ease of maintenance. This work has been supported in part by the National Science Foundation under the ITR grants IIS0086057 and SI0122599 and by Boeing, IBM, Intel, Microsoft, Siemens, and the UC MICRO program. |
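A much-simplified sketch of the prefix-sharing idea: merge many linear path queries into one shared structure so common prefixes are processed once. The sketch only handles child-axis steps (/a/b/c) with a prefix trie; the actual YFilter compiles queries, including '//' and '*', into a single NFA that runs over streaming XML events.

# Shared-prefix matching of path queries (simplified stand-in for YFilter's NFA).
def build_trie(queries):
    trie = {}
    for qid, q in enumerate(queries):
        node = trie
        for step in q.strip("/").split("/"):
            node = node.setdefault(step, {})
        node.setdefault("$matches", []).append(qid)
    return trie

def match_path(trie, path):
    node = trie
    for step in path.strip("/").split("/"):
        if step not in node:
            return []
        node = node[step]
    return node.get("$matches", [])

if __name__ == "__main__":
    queries = ["/catalog/book/title", "/catalog/book/price", "/catalog/cd/title"]
    trie = build_trie(queries)   # the /catalog/book prefix is stored only once
    print(match_path(trie, "/catalog/book/price"))   # -> [1]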
Yue Wang, Alexandra Meliou, Gerome Miklau A Consumer-Centric Market for Database Computation in the Cloud Journal Article CoRR, abs/1609.02104 , 2016. @article{DBLP:journals/corr/WangMM16, title = {A Consumer-Centric Market for Database Computation in the Cloud}, author = {Yue Wang and Alexandra Meliou and Gerome Miklau}, url = {http://arxiv.org/abs/1609.02104}, year = {2016}, date = {2016-01-01}, journal = {CoRR}, volume = {abs/1609.02104}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Xiaolan Wang, Alexandra Meliou, Eugene Wu QFix: Diagnosing errors through query histories Journal Article CoRR, abs/1601.07539 , 2016. @article{DBLP:journals/corr/WangMW16, title = {QFix: Diagnosing errors through query histories}, author = {Xiaolan Wang and Alexandra Meliou and Eugene Wu}, url = {http://arxiv.org/abs/1601.07539}, year = {2016}, date = {2016-01-01}, journal = {CoRR}, volume = {abs/1601.07539}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Xiaolan Wang, Alexandra Meliou, Eugene Wu QFix: Demonstrating Error Diagnosis in Query Histories Inproceedings Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016, pp. 2177–2180, 2016. @inproceedings{DBLP:conf/sigmod/WangM016, title = {QFix: Demonstrating Error Diagnosis in Query Histories}, author = {Xiaolan Wang and Alexandra Meliou and Eugene Wu}, url = {http://doi.acm.org/10.1145/2882903.2899388}, doi = {10.1145/2882903.2899388}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016}, pages = {2177--2180}, crossref = {DBLP:conf/sigmod/2016}, abstract = {An increasing number of applications in all aspects of society rely on data. Despite the long line of research in data cleaning and repairs, data correctness has been an elusive goal. Errors in the data can be extremely disruptive, and are detrimental to the effectiveness and proper function of data-driven applications. Even when data is cleaned, new errors can be introduced by applications and users who interact with the data. Subsequent valid updates can obscure these errors and propagate them through the dataset causing more discrepancies. Any discovered errors tend to be corrected superficially, on a case-by-case basis, further obscuring the true underlying cause, and making detection of the remaining errors harder. In this demo proposal, we outline the design of QFix, a query-centric framework that derives explanations and repairs for discrepancies in relational data based on potential errors in the queries that operated on the data. This is a marked departure from traditional data-centric techniques that directly fix the data. We then describe how users will use QFix in a demonstration scenario. Participants will be able to select from a number of transactional benchmarks, introduce errors into the queries that are executed, and compare the fixes to the queries proposed by QFix as well as existing alternative algorithms such as decision trees.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } An increasing number of applications in all aspects of society rely on data. Despite the long line of research in data cleaning and repairs, data correctness has been an elusive goal. Errors in the data can be extremely disruptive, and are detrimental to the effectiveness and proper function of data-driven applications. Even when data is cleaned, new errors can be introduced by applications and users who interact with the data. Subsequent valid updates can obscure these errors and propagate them through the dataset causing more discrepancies. Any discovered errors tend to be corrected superficially, on a case-by-case basis, further obscuring the true underlying cause, and making detection of the remaining errors harder. In this demo proposal, we outline the design of QFix, a query-centric framework that derives explanations and repairs for discrepancies in relational data based on potential errors in the queries that operated on the data. This is a marked departure from traditional data-centric techniques that directly fix the data. We then describe how users will use QFix in a demonstration scenario. 
Participants will be able to select from a number of transactional benchmarks, introduce errors into the queries that are executed, and compare the fixes to the queries proposed by QFix as well as existing alternative algorithms such as decision trees. |
Matteo Brucato, Juan Felipe Beltran, Azza Abouzied, Alexandra Meliou Scalable Package Queries in Relational Database Systems Journal Article PVLDB, 9 (7), pp. 576–587, 2016, (Best of VLDB 2016). @article{DBLP:journals/pvldb/BrucatoBAM16, title = {Scalable Package Queries in Relational Database Systems}, author = {Matteo Brucato and Juan Felipe Beltran and Azza Abouzied and Alexandra Meliou}, url = {http://www.vldb.org/pvldb/vol9/p576-brucato.pdf}, year = {2016}, date = {2016-01-01}, journal = {PVLDB}, volume = {9}, number = {7}, pages = {576--587}, abstract = {Traditional database queries follow a simple model: they define constraints that each tuple in the result must satisfy. This model is computationally efficient, as the database system can evaluate the query conditions on each tuple individually. However, many practical, real-world problems require a collection of result tuples to satisfy constraints collectively, rather than individually. In this paper, we present package queries, a new query model that extends traditional database queries to handle complex constraints and preferences over answer sets. We develop a full-fledged package query system, implemented on top of a traditional database engine. Our work makes several contributions. First, we design PaQL, a SQL-based query language that supports the declarative specification of package queries. We prove that PaQL is at least as expressive as integer linear programming, and therefore, evaluation of package queries is in general NP-hard. Second, we present a fundamental evaluation strategy that combines the capabilities of databases and constraint optimization solvers to derive solutions to package queries. The core of our approach is a set of translation rules that transform a package query to an integer linear program. Third, we introduce an offline data partitioning strategy allowing query evaluation to scale to large data sizes. Fourth, we introduce SketchRefine, a scalable algorithm for package evaluation, with strong approximation guarantees ((1±ε)^6-factor approximation). Finally, we present extensive experiments over real-world and benchmark data. The results demonstrate that SketchRefine is effective at deriving high-quality package results, and achieves runtime performance that is an order of magnitude faster than directly using ILP solvers over large datasets.}, note = {Best of VLDB 2016}, keywords = {}, pubstate = {published}, tppubtype = {article} } Traditional database queries follow a simple model: they define constraints that each tuple in the result must satisfy. This model is computationally efficient, as the database system can evaluate the query conditions on each tuple individually. However, many practical, real-world problems require a collection of result tuples to satisfy constraints collectively, rather than individually. In this paper, we present package queries, a new query model that extends traditional database queries to handle complex constraints and preferences over answer sets. We develop a full-fledged package query system, implemented on top of a traditional database engine. Our work makes several contributions. First, we design PaQL, a SQL-based query language that supports the declarative specification of package queries. We prove that PaQL is at least as expressive as integer linear programming, and therefore, evaluation of package queries is in general NP-hard. 
Second, we present a fundamental evaluation strategy that combines the capabilities of databases and constraint optimization solvers to derive solutions to package queries. The core of our approach is a set of translation rules that transform a package query to an integer linear program. Third, we introduce an offline data partitioning strategy allowing query evaluation to scale to large data sizes. Fourth, we introduce SketchRefine, a scalable algorithm for package evaluation, with strong approximation guarantees ((1±ε)^6-factor approximation). Finally, we present extensive experiments over real-world and benchmark data. The results demonstrate that SketchRefine is effective at deriving high-quality package results, and achieves runtime performance that is an order of magnitude faster than directly using ILP solvers over large datasets. |
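To make the package semantics concrete: unlike an ordinary selection, a package query constrains and optimizes the *collective* properties of the answer set. The brute force below only illustrates what is being asked on a handful of rows; the paper translates such PaQL queries into integer linear programs and scales evaluation with partitioning and SketchRefine. The meals table and query are invented examples.

# Roughly: SELECT PACKAGE(*) FROM meals
#          SUCH THAT SUM(calories) <= 1200
#          MAXIMIZE SUM(protein)
from itertools import combinations

meals = [
    {"name": "oatmeal", "calories": 300, "protein": 10},
    {"name": "chicken", "calories": 500, "protein": 40},
    {"name": "salad",   "calories": 200, "protein": 5},
    {"name": "salmon",  "calories": 450, "protein": 35},
]

best, best_protein = None, -1
for r in range(len(meals) + 1):
    for pkg in combinations(meals, r):
        if sum(m["calories"] for m in pkg) <= 1200 and \
           sum(m["protein"] for m in pkg) > best_protein:
            best, best_protein = pkg, sum(m["protein"] for m in pkg)

print([m["name"] for m in best], best_protein)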
Michael Hay, Ashwin Machanavajjhala, Gerome Miklau, Yan Chen, Dan Zhang, George Bissias Exploring Privacy-Accuracy Tradeoffs using DPComp Inproceedings Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016, pp. 2101–2104, 2016. @inproceedings{DBLP:conf/sigmod/HayMMCZB16, title = {Exploring Privacy-Accuracy Tradeoffs using DPComp}, author = {Michael Hay and Ashwin Machanavajjhala and Gerome Miklau and Yan Chen and Dan Zhang and George Bissias}, url = {http://doi.acm.org/10.1145/2882903.2899387}, doi = {10.1145/2882903.2899387}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016}, pages = {2101--2104}, crossref = {DBLP:conf/sigmod/2016}, abstract = {The emergence of differential privacy as a primary standard for privacy protection has led to the development, by the research community, of hundreds of algorithms for various data analysis tasks. Yet deployment of these techniques has been slowed by the complexity of algorithms and an incomplete understanding of the cost to accuracy implied by the adoption of differential privacy. In this demonstration we present DPComp, a publicly-accessible web-based system, designed to support a broad community of users, including data analysts, privacy researchers, and data owners. Users can use DPComp to assess the accuracy of state-of-the-art privacy algorithms and interactively explore algorithm output in order to understand, both quantitatively and qualitatively, the error introduced by the algorithms. In addition, users can contribute new algorithms and new (non-sensitive) datasets. DPComp automatically incorporates user contributions into an evolving benchmark based on a rigorous evaluation methodology articulated by Hay et al. }, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } The emergence of differential privacy as a primary standard for privacy protection has led to the development, by the research community, of hundreds of algorithms for various data analysis tasks. Yet deployment of these techniques has been slowed by the complexity of algorithms and an incomplete understanding of the cost to accuracy implied by the adoption of differential privacy. In this demonstration we present DPComp, a publicly-accessible web-based system, designed to support a broad community of users, including data analysts, privacy researchers, and data owners. Users can use DPComp to assess the accuracy of state-of-the-art privacy algorithms and interactively explore algorithm output in order to understand, both quantitatively and qualitatively, the error introduced by the algorithms. In addition, users can contribute new algorithms and new (non-sensitive) datasets. DPComp automatically incorporates user contributions into an evolving benchmark based on a rigorous evaluation methodology articulated by Hay et al. |
Michael Hay, Ashwin Machanavajjhala, Gerome Miklau, Yan Chen, Dan Zhang Principled Evaluation of Differentially Private Algorithms using DPBench Inproceedings Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016, pp. 139–154, 2016. @inproceedings{DBLP:conf/sigmod/HayMMCZ16, title = {Principled Evaluation of Differentially Private Algorithms using DPBench}, author = {Michael Hay and Ashwin Machanavajjhala and Gerome Miklau and Yan Chen and Dan Zhang}, url = {http://doi.acm.org/10.1145/2882903.2882931}, doi = {10.1145/2882903.2882931}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 26 - July 01, 2016}, pages = {139--154}, crossref = {DBLP:conf/sigmod/2016}, abstract = {Differential privacy has become the dominant standard in the research community for strong privacy protection. There has been a flood of research into query answering algorithms that meet this standard. Algorithms are becoming increasingly complex, and in particular, the performance of many emerging algorithms is data dependent, meaning the distribution of the noise added to query answers may change depending on the input data. Theoretical analysis typically only considers the worst case, making empirical study of average case performance increasingly important. In this paper we propose a set of evaluation principles which we argue are essential for sound evaluation. Based on these principles we propose DPBench, a novel evaluation framework for standardized evaluation of privacy algorithms. We then apply our benchmark to evaluate algorithms for answering 1- and 2-dimensional range queries. The result is a thorough empirical study of 15 published algorithms on a total of 27 datasets that offers new insights into algorithm behavior---in particular the influence of dataset scale and shape---and a more complete characterization of the state of the art. Our methodology is able to resolve inconsistencies in prior empirical studies and place algorithm performance in context through comparison to simple baselines. Finally, we pose open research questions which we hope will guide future algorithm design.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Differential privacy has become the dominant standard in the research community for strong privacy protection. There has been a flood of research into query answering algorithms that meet this standard. Algorithms are becoming increasingly complex, and in particular, the performance of many emerging algorithms is data dependent, meaning the distribution of the noise added to query answers may change depending on the input data. Theoretical analysis typically only considers the worst case, making empirical study of average case performance increasingly important. In this paper we propose a set of evaluation principles which we argue are essential for sound evaluation. Based on these principles we propose DPBench, a novel evaluation framework for standardized evaluation of privacy algorithms. We then apply our benchmark to evaluate algorithms for answering 1- and 2-dimensional range queries. The result is a thorough empirical study of 15 published algorithms on a total of 27 datasets that offers new insights into algorithm behavior---in particular the influence of dataset scale and shape---and a more complete characterization of the state of the art. 
Our methodology is able to resolve inconsistencies in prior empirical studies and place algorithm performance in context through comparison to simple baselines. Finally, we pose open research questions which we hope will guide future algorithm design. |
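The kind of measurement such a benchmark standardizes can be sketched as: run a private algorithm many times on a fixed dataset, answer random 1-D range queries from the noisy output, and report average error alongside a trivial non-private baseline for context. The Laplace "identity" mechanism and the uniform baseline below stand in for the 15 published algorithms the paper actually evaluates.

# Sketch of averaged range-query error measurement (illustrative only).
import numpy as np

def laplace_identity(hist, eps, rng):
    # Per-bin Laplace noise; L1 sensitivity of a count histogram is 1.
    return hist + rng.laplace(scale=1.0 / eps, size=hist.shape)

def avg_range_error(hist, make_estimate, trials=50, queries=200, seed=0):
    rng = np.random.default_rng(seed)
    errs = []
    for _ in range(trials):
        est = make_estimate(hist, rng)
        for _ in range(queries):
            lo, hi = sorted(rng.integers(0, hist.size, size=2))
            errs.append(abs(est[lo:hi + 1].sum() - hist[lo:hi + 1].sum()))
    return float(np.mean(errs))

if __name__ == "__main__":
    hist = np.array([120, 95, 130, 110, 80, 150, 90, 105], dtype=float)
    for eps in (0.1, 1.0):
        err = avg_range_error(hist, lambda h, rng: laplace_identity(h, eps, rng))
        print(f"eps={eps}: avg range-query error = {err:.1f}")
    base = avg_range_error(hist, lambda h, rng: np.full(h.shape, h.mean()))
    print(f"non-private uniform baseline error = {base:.1f}")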
Julia Stoyanovich, Serge Abiteboul, Gerome Miklau Data Responsibly: Fairness, Neutrality and Transparency in Data Analysis Inproceedings Proceedings of the 19th International Conference on Extending Database Technology, EDBT 2016, Bordeaux, France, March 15-16, 2016, Bordeaux, France, March 15-16, 2016., pp. 718–719, 2016. @inproceedings{DBLP:conf/edbt/StoyanovichAM16, title = {Data Responsibly: Fairness, Neutrality and Transparency in Data Analysis}, author = {Julia Stoyanovich and Serge Abiteboul and Gerome Miklau}, url = {http://dx.doi.org/10.5441/002/edbt.2016.103}, doi = {10.5441/002/edbt.2016.103}, year = {2016}, date = {2016-01-01}, booktitle = {Proceedings of the 19th International Conference on Extending Database Technology, EDBT 2016, Bordeaux, France, March 15-16, 2016, Bordeaux, France, March 15-16, 2016.}, pages = {718--719}, crossref = {DBLP:conf/edbt/2016}, abstract = {Big data technology holds incredible promise of improving people’s lives, accelerating scientific discovery and innovation, and bringing about positive societal change. Yet, if not used responsibly, this technology can propel economic inequality, destabilize global markets and affirm systemic bias. While the potential benefits of big data are well-accepted, the importance of using these techniques in a fair and transparent manner is rarely considered. The primary goal of this tutorial is to draw the attention of the data management community to the important emerging subject of responsible data management and analysis. We will offer our perspective on the issue, will give an overview of existing technical work, primarily from the data mining and algorithms communities, and will motivate future research directions.}, keywords = {}, pubstate = {published}, tppubtype = {inproceedings} } Big data technology holds incredible promise of improving people’s lives, accelerating scientific discovery and innovation, and bringing about positive societal change. Yet, if not used responsibly, this technology can propel economic inequality, destabilize global markets and affirm systemic bias. While the potential benefits of big data are well-accepted, the importance of using these techniques in a fair and transparent manner is rarely considered. The primary goal of this tutorial is to draw the attention of the data management community to the important emerging subject of responsible data management and analysis. We will offer our perspective on the issue, will give an overview of existing technical work, primarily from the data mining and algorithms communities, and will motivate future research directions. |
Kyriaki Dimitriadou, Olga Papaemmanouil, Yanlei Diao AIDE: An Active Learning-Based Approach for Interactive Data Exploration Journal Article IEEE Trans. Knowl. Data Eng., 28 (11), pp. 2842–2856, 2016. @article{DBLP:journals/tkde/DimitriadouPD16, title = {AIDE: An Active Learning-Based Approach for Interactive Data Exploration}, author = {Kyriaki Dimitriadou and Olga Papaemmanouil and Yanlei Diao}, url = {http://dx.doi.org/10.1109/TKDE.2016.2599168}, doi = {10.1109/TKDE.2016.2599168}, year = {2016}, date = {2016-01-01}, journal = {IEEE Trans. Knowl. Data Eng.}, volume = {28}, number = {11}, pages = {2842--2856}, abstract = {In this paper, we argue that database systems be augmented with an automated data exploration service that methodically steers users through the data in a meaningful way. Such an automated system is crucial for deriving insights from complex datasets found in many big data applications such as scientific and healthcare applications as well as for reducing the human effort of data exploration. Towards this end, we present AIDE, an Automatic Interactive Data Exploration framework that assists users in discovering new interesting data patterns and eliminate expensive ad-hoc exploratory queries. AIDE relies on a seamless integration of classification algorithms and data management optimization techniques that collectively strive to accurately learn the user interests based on his relevance feedback on strategically collected samples. We present a number of exploration techniques as well as optimizations that minimize the number of samples presented to the user while offering interactive performance. AIDE can deliver highly accurate query predictions for very common conjunctive queries with small user effort while, given a reasonable number of samples, it can predict with high accuracy complex disjunctive queries. It provides interactive performance as it limits the user wait time per iteration of exploration to less than a few seconds.}, keywords = {}, pubstate = {published}, tppubtype = {article} } In this paper, we argue that database systems be augmented with an automated data exploration service that methodically steers users through the data in a meaningful way. Such an automated system is crucial for deriving insights from complex datasets found in many big data applications such as scientific and healthcare applications as well as for reducing the human effort of data exploration. Towards this end, we present AIDE, an Automatic Interactive Data Exploration framework that assists users in discovering new interesting data patterns and eliminate expensive ad-hoc exploratory queries. AIDE relies on a seamless integration of classification algorithms and data management optimization techniques that collectively strive to accurately learn the user interests based on his relevance feedback on strategically collected samples. We present a number of exploration techniques as well as optimizations that minimize the number of samples presented to the user while offering interactive performance. AIDE can deliver highly accurate query predictions for very common conjunctive queries with small user effort while, given a reasonable number of samples, it can predict with high accuracy complex disjunctive queries. It provides interactive performance as it limits the user wait time per iteration of exploration to less than a few seconds. |
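The explore/label/learn loop behind this line of work can be sketched as below: a hidden rectangular predicate simulates the user's true interest, a scikit-learn decision tree stands in for AIDE's relevance model, and samples are drawn uniformly. AIDE's contribution lies precisely in choosing samples far more cleverly and in the data management optimizations that keep each iteration interactive; none of that is reproduced here.

# Toy interactive-exploration loop (uniform sampling; illustrative only).
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def user_interest(X):
    # Hidden predicate standing in for the user's true interest region.
    return (X[:, 0] > 20) & (X[:, 0] < 40) & (X[:, 1] > 50)

rng = np.random.default_rng(1)
data = rng.uniform(0, 100, size=(5000, 2))            # the underlying table (2 attributes)
labeled_idx = list(rng.integers(0, len(data), size=20))

for iteration in range(5):
    X = data[labeled_idx]
    y = user_interest(X)                               # simulated relevance feedback
    model = DecisionTreeClassifier(max_depth=4).fit(X, y)
    pred = model.predict(data)
    accuracy = np.mean(pred == user_interest(data))
    print(f"iteration {iteration}: {len(labeled_idx)} labels, accuracy {accuracy:.3f}")
    # Request feedback on a fresh batch of samples (uniform here, steered in AIDE).
    labeled_idx += list(rng.integers(0, len(data), size=20))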
Yue Wang, Alexandra Meliou, Gerome Miklau Lifting the Haze off the Cloud: A Consumer-Centric Market for Database Computation in the Cloud Journal Article PVLDB, 10 (4), pp. 373–384, 2016. @article{DBLP:journals/pvldb/WangMM16, title = {Lifting the Haze off the Cloud: A Consumer-Centric Market for Database Computation in the Cloud}, author = {Yue Wang and Alexandra Meliou and Gerome Miklau}, url = {http://www.vldb.org/pvldb/vol10/p373-wang.pdf}, year = {2016}, date = {2016-01-01}, journal = {PVLDB}, volume = {10}, number = {4}, pages = {373--384}, abstract = {The availability of public computing resources in the cloud has revolutionized data analysis, but requesting cloud resources often involves complex decisions for consumers. Estimating the completion time and cost of a computation and requesting the appropriate cloud resources are challenging tasks even for an expert user. We propose a new market-based framework for pricing computational tasks in the cloud. Our framework introduces an agent between consumers and cloud providers. The agent takes data and computational tasks from users, estimates time and cost for evaluating the tasks, and returns to consumers contracts that specify the price and completion time. Our framework can be applied directly to existing cloud markets without altering the way cloud providers offer and price services. In addition, it simplifies cloud use for consumers by allowing them to compare contracts, rather than choose resources directly. We present design, analytical, and algorithmic contributions focusing on pricing computation contracts, analyzing their properties, and optimizing them in complex workflows. We conduct an experimental evaluation of our market framework over a real-world cloud service and demonstrate empirically that our market ensures three key properties: (a) that consumers benefit from using the market due to competitiveness among agents, (b) that agents have an incentive to price contracts fairly, and (c) that inaccuracies in estimates do not pose a significant risk to agents' profits. Finally, we present a fine-grained pricing mechanism for complex workflows and show that it can increase agent profits by more than an order of magnitude in some cases.}, keywords = {}, pubstate = {published}, tppubtype = {article} } The availability of public computing resources in the cloud has revolutionized data analysis, but requesting cloud resources often involves complex decisions for consumers. Estimating the completion time and cost of a computation and requesting the appropriate cloud resources are challenging tasks even for an expert user. We propose a new market-based framework for pricing computational tasks in the cloud. Our framework introduces an agent between consumers and cloud providers. The agent takes data and computational tasks from users, estimates time and cost for evaluating the tasks, and returns to consumers contracts that specify the price and completion time. Our framework can be applied directly to existing cloud markets without altering the way cloud providers offer and price services. In addition, it simplifies cloud use for consumers by allowing them to compare contracts, rather than choose resources directly. We present design, analytical, and algorithmic contributions focusing on pricing computation contracts, analyzing their properties, and optimizing them in complex workflows. 
We conduct an experimental evaluation of our market framework over a real-world cloud service and demonstrate empirically that our market ensures three key properties: (a) that consumers benefit from using the market due to competitiveness among agents, (b) that agents have an incentive to price contracts fairly, and (c) that inaccuracies in estimates do not pose a significant risk to agents' profits. Finally, we present a fine-grained pricing mechanism for complex workflows and show that it can increase agent profits by more than an order of magnitude in some cases. |
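A toy rendering of the consumer-facing abstraction: instead of choosing cloud resources directly, the consumer compares contracts (price, promised completion time) that agents derive from their own cost and runtime estimates. The margin and the consumer's preference function below are invented for illustration; the paper develops the actual pricing mechanisms, their analysis, and the workflow optimizations.

# Toy contracts produced by agents and compared by a consumer (illustrative only).
from dataclasses import dataclass

@dataclass
class Contract:
    agent: str
    price: float            # dollars
    completion_time: float  # hours

def make_contract(agent, est_cost, est_hours, margin=0.25):
    # Agent prices the task at estimated cloud cost plus a risk/profit margin.
    return Contract(agent, price=est_cost * (1 + margin), completion_time=est_hours)

def pick(contracts, hours_value=5.0):
    # Consumer preference: price plus a dollar value placed on waiting time.
    return min(contracts, key=lambda c: c.price + hours_value * c.completion_time)

if __name__ == "__main__":
    offers = [
        make_contract("agent-a", est_cost=12.0, est_hours=2.0),
        make_contract("agent-b", est_cost=8.0, est_hours=6.0, margin=0.10),
    ]
    best = pick(offers)
    print(f"chose {best.agent}: ${best.price:.2f}, {best.completion_time}h")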
Serge Abiteboul, Gerome Miklau, Julia Stoyanovich, Gerhard Weikum Data, Responsibly (Dagstuhl Seminar 16291) Journal Article Dagstuhl Reports, 6 (7), pp. 42–71, 2016. @article{DBLP:journals/dagstuhl-reports/AbiteboulMSW16, title = {Data, Responsibly (Dagstuhl Seminar 16291)}, author = {Serge Abiteboul and Gerome Miklau and Julia Stoyanovich and Gerhard Weikum}, url = {http://dx.doi.org/10.4230/DagRep.6.7.42}, doi = {10.4230/DagRep.6.7.42}, year = {2016}, date = {2016-01-01}, journal = {Dagstuhl Reports}, volume = {6}, number = {7}, pages = {42--71}, abstract = {Big data technology promises to improve people's lives, accelerate scientific discovery and innovation, and bring about positive societal change. Yet, if not used responsibly, large-scale data analysis and data-driven algorithmic decision-making can increase economic inequality, affirm systemic bias, and even destabilize global markets. While the potential benefits of data analysis techniques are well accepted, the importance of using them responsibly - that is, in accordance with ethical and moral norms, and with legal and policy considerations - is not yet part of the mainstream research agenda in computer science. Dagstuhl Seminar "Data, Responsibly" brought together academic and industry researchers from several areas of computer science, including a broad representation of data management, but also data mining, security/privacy, and computer networks, as well as social sciences researchers, data journalists, and those active in government think-tanks and policy initiatives. The goals of the seminar were to assess the state of data analysis in terms of fairness, transparency and diversity, identify new research challenges, and derive an agenda for computer science research and education efforts in responsible data analysis and use. While the topic of the seminar is transdisciplinary in nature, an important goal of the seminar was to identify opportunities for high-impact contributions to this important emergent area specifically from the data management community.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Big data technology promises to improve people's lives, accelerate scientific discovery and innovation, and bring about positive societal change. Yet, if not used responsibly, large-scale data analysis and data-driven algorithmic decision-making can increase economic inequality, affirm systemic bias, and even destabilize global markets. While the potential benefits of data analysis techniques are well accepted, the importance of using them responsibly - that is, in accordance with ethical and moral norms, and with legal and policy considerations - is not yet part of the mainstream research agenda in computer science. Dagstuhl Seminar "Data, Responsibly" brought together academic and industry researchers from several areas of computer science, including a broad representation of data management, but also data mining, security/privacy, and computer networks, as well as social sciences researchers, data journalists, and those active in government think-tanks and policy initiatives. The goals of the seminar were to assess the state of data analysis in terms of fairness, transparency and diversity, identify new research challenges, and derive an agenda for computer science research and education efforts in responsible data analysis and use. 
While the topic of the seminar is transdisciplinary in nature, an important goal of the seminar was to identify opportunities for high-impact contributions to this important emergent area specifically from the data management community. |
Olga Papaemmanouil, Yanlei Diao, Kyriaki Dimitriadou, Liping Peng Interactive Data Exploration via Machine Learning Models Journal Article IEEE Data Eng. Bull., 39 (4), pp. 38–49, 2016. @article{DBLP:journals/debu/PapaemmanouilDD16, title = {Interactive Data Exploration via Machine Learning Models}, author = {Olga Papaemmanouil and Yanlei Diao and Kyriaki Dimitriadou and Liping Peng}, url = {http://sites.computer.org/debull/A16dec/p38.pdf}, year = {2016}, date = {2016-01-01}, journal = {IEEE Data Eng. Bull.}, volume = {39}, number = {4}, pages = {38--49}, abstract = {This article provides an overview of our research on data exploration. Our work aims to facilitate interactive exploration tasks in many big data applications in the scientific, biomedical and healthcare domains. We argue for a shift towards learning-based exploration techniques that automatically steer the user towards interesting data areas based on relevance feedback on database samples, aiming to achieve the goal of identifying all database objects that match the user interest with high efficiency. Our research realizes machine learning theory in the new setting of interactive data exploration and develops new optimizations to support “automated” data exploration with high performance over large databases. In this paper, we discuss a suite of techniques that draw insights from machine learning algorithms to guide the exploration of a big data space and leverage the knowledge of exploration patterns to optimize query processing inside the database.}, keywords = {}, pubstate = {published}, tppubtype = {article} } This article provides an overview of our research on data exploration. Our work aims to facilitate interactive exploration tasks in many big data applications in the scientific, biomedical and healthcare domains. We argue for a shift towards learning-based exploration techniques that automatically steer the user towards interesting data areas based on relevance feedback on database samples, aiming to achieve the goal of identifying all database objects that match the user interest with high efficiency. Our research realizes machine learning theory in the new setting of interactive data exploration and develops new optimizations to support “automated” data exploration with high performance over large databases. In this paper, we discuss a suite of techniques that draw insights from machine learning algorithms to guide the exploration of a big data space and leverage the knowledge of exploration patterns to optimize query processing inside the database. |
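As a rough illustration of the relevance-feedback loop described in the abstract above, the sketch below is hypothetical and is not the authors' system: it fits a scikit-learn decision tree to user labels on sampled tuples and uses the model's predictions to steer which tuples are surfaced next. The function `explore` and the stand-in `label_fn` are names invented for this example.

```python
# Hypothetical sketch of a learning-based exploration loop. A classifier is
# trained on the user's labels over sampled tuples, then used to pick the
# next batch of tuples to show the user. `label_fn` stands in for the user.
import numpy as np
from sklearn.tree import DecisionTreeClassifier


def explore(data: np.ndarray, label_fn, rounds: int = 5, batch: int = 20):
    rng = np.random.default_rng(0)
    idx = rng.choice(len(data), size=batch, replace=False)
    X = data[idx]
    y = np.array([label_fn(x) for x in X])
    model = DecisionTreeClassifier(max_depth=4)

    for _ in range(rounds):
        model.fit(X, y)
        if len(set(y)) > 1:
            # Steer sampling toward tuples the model currently finds relevant.
            scores = model.predict_proba(data)[:, 1]
        else:
            # No positive/negative contrast yet: keep sampling at random.
            scores = rng.random(len(data))
        idx = np.argsort(-scores)[:batch]  # simplified: may re-sample labeled tuples
        X = np.vstack([X, data[idx]])
        y = np.concatenate([y, [label_fn(x) for x in data[idx]]])

    return model  # approximates the user's interest region over the database


# Example use with synthetic data: "interesting" tuples lie in one corner of a
# 2-D space, standing in for the user's (unknown) interest region.
demo = np.random.default_rng(1).uniform(size=(1000, 2))
interest_model = explore(demo, lambda x: bool(x[0] > 0.7 and x[1] > 0.7))
```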
2015 |
Yanlei Diao, Kyriaki Dimitriadou, Zhan Li, Wenzhao Liu, Olga Papaemmanouil, Kemi Peng, Liping Peng AIDE: An Automatic User Navigation System for Interactive Data Exploration Journal Article PVLDB, 8 (12), pp. 1964–1975, 2015. @article{DBLP:journals/pvldb/DiaoDLLPPP15, title = {AIDE: An Automatic User Navigation System for Interactive Data Exploration}, author = {Yanlei Diao and Kyriaki Dimitriadou and Zhan Li and Wenzhao Liu and Olga Papaemmanouil and Kemi Peng and Liping Peng}, url = {http://www.vldb.org/pvldb/vol8/p1964-Diao.pdf}, year = {2015}, date = {2015-01-01}, journal = {PVLDB}, volume = {8}, number = {12}, pages = {1964--1975}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Chao Li, Gerome Miklau Lower Bounds on the Error of Query Sets Under the Differentially-Private Matrix Mechanism Journal Article Theory of Computing Systems, pp. 1–43, 2015, ISSN: 1432-4350. @article{, title = {Lower Bounds on the Error of Query Sets Under the Differentially-Private Matrix Mechanism}, author = {Li, Chao and Miklau, Gerome}, url = {http://dx.doi.org/10.1007/s00224-015-9610-z}, doi = {10.1007/s00224-015-9610-z}, issn = {1432-4350}, year = {2015}, date = {2015-01-01}, journal = {Theory of Computing Systems}, pages = {1--43}, publisher = {Springer US}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Xiaolan Wang, Mary Feng, Yue Wang, Xin Luna Dong, Alexandra Meliou Error Diagnosis and Data Profiling with Data X-Ray Journal Article PVLDB, 8 (12), pp. 1984–1995, 2015. @article{DBLP:journals/pvldb/WangFWDM15, title = {Error Diagnosis and Data Profiling with Data X-Ray}, author = {Xiaolan Wang and Mary Feng and Yue Wang and Xin Luna Dong and Alexandra Meliou}, url = {http://www.vldb.org/pvldb/vol8/p1984-wang.pdf}, year = {2015}, date = {2015-01-01}, journal = {PVLDB}, volume = {8}, number = {12}, pages = {1984--1995}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Ravali Pochampally, Anish Das Sarma, Xin Luna Dong, Alexandra Meliou, Divesh Srivastava Fusing Data with Correlations Journal Article CoRR, abs/1503.00306 , 2015. @article{DBLP:journals/corr/PochampallySDMS15, title = {Fusing Data with Correlations}, author = {Ravali Pochampally and Anish Das Sarma and Xin Luna Dong and Alexandra Meliou and Divesh Srivastava}, url = {http://arxiv.org/abs/1503.00306}, year = {2015}, date = {2015-01-01}, journal = {CoRR}, volume = {abs/1503.00306}, keywords = {}, pubstate = {published}, tppubtype = {article} } |