sqlglot.dialects.bigquery
from __future__ import annotations

import logging
import re
import typing as t

from sqlglot import exp, generator, parser, tokens, transforms
from sqlglot._typing import E
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    annotate_with_type_lambda,
    arg_max_or_min_no_count,
    binary_from_function,
    date_add_interval_sql,
    datestrtodate_sql,
    build_formatted_time,
    filter_array_using_unnest,
    if_sql,
    inline_array_unless_query,
    max_or_greatest,
    min_or_least,
    no_ilike_sql,
    build_date_delta_with_interval,
    regexp_replace_sql,
    rename_func,
    sha256_sql,
    timestrtotime_sql,
    ts_or_ds_add_cast,
    unit_to_var,
    strposition_sql,
    groupconcat_sql,
)
from sqlglot.helper import seq_get, split_num_words
from sqlglot.tokens import TokenType
from sqlglot.generator import unsupported_args

if t.TYPE_CHECKING:
    from sqlglot._typing import Lit

    from sqlglot.optimizer.annotate_types import TypeAnnotator

logger = logging.getLogger("sqlglot")


JSON_EXTRACT_TYPE = t.Union[exp.JSONExtract, exp.JSONExtractScalar, exp.JSONExtractArray]

DQUOTES_ESCAPING_JSON_FUNCTIONS = ("JSON_QUERY", "JSON_VALUE", "JSON_QUERY_ARRAY")


def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
    if not expression.find_ancestor(exp.From, exp.Join):
        return self.values_sql(expression)

    structs = []
    alias = expression.args.get("alias")
    for tup in expression.find_all(exp.Tuple):
        field_aliases = (
            alias.columns
            if alias and alias.columns
            else (f"_c{i}" for i in range(len(tup.expressions)))
        )
        expressions = [
            exp.PropertyEQ(this=exp.to_identifier(name), expression=fld)
            for name, fld in zip(field_aliases, tup.expressions)
        ]
        structs.append(exp.Struct(expressions=expressions))

    # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression
    alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None
    return self.unnest_sql(
        exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only)
    )


def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str:
    this = expression.this
    if isinstance(this, exp.Schema):
        this = f"{self.sql(this, 'this')} <{self.expressions(this)}>"
    else:
        this = self.sql(this)
    return f"RETURNS {this}"


def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str:
    returns = expression.find(exp.ReturnsProperty)
    if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"):
        expression.set("kind", "TABLE FUNCTION")

    if isinstance(expression.expression, (exp.Subquery, exp.Literal)):
        expression.set("expression", expression.expression.this)

    return self.create_sql(expression)


# https://issuetracker.google.com/issues/162294746
# Workaround for a BigQuery bug when grouping by an expression and then ordering:
# WITH x AS (SELECT 1 y)
# SELECT y + 1 z
# FROM x
# GROUP BY y + 1
# ORDER BY z
def _alias_ordered_group(expression: exp.Expression) -> exp.Expression:
    if isinstance(expression, exp.Select):
        group = expression.args.get("group")
        order = expression.args.get("order")

        if group and order:
            aliases = {
                select.this: select.args["alias"]
                for select in expression.selects
                if isinstance(select, exp.Alias)
            }

            for grouped in group.expressions:
                if grouped.is_int:
                    continue
                alias = aliases.get(grouped)
                if alias:
                    grouped.replace(exp.column(alias))

    return expression
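

# Illustrative sketch (not part of the original module): the transform above swaps a
# grouped expression for its projection alias, so the generated BigQuery should look
# roughly like this:
#
#   >>> import sqlglot
#   >>> sqlglot.transpile("SELECT y + 1 AS z FROM x GROUP BY y + 1 ORDER BY z", write="bigquery")
#   ['SELECT y + 1 AS z FROM x GROUP BY z ORDER BY z']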


def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression:
    """BigQuery doesn't allow column names when defining a CTE, so we try to push them down."""
    if isinstance(expression, exp.CTE) and expression.alias_column_names:
        cte_query = expression.this

        if cte_query.is_star:
            logger.warning(
                "Can't push down CTE column names for star queries. Run the query through"
                " the optimizer or use 'qualify' to expand the star projections first."
            )
            return expression

        column_names = expression.alias_column_names
        expression.args["alias"].set("columns", None)

        for name, select in zip(column_names, cte_query.selects):
            to_replace = select

            if isinstance(select, exp.Alias):
                select = select.this

            # Inner aliases are shadowed by the CTE column names
            to_replace.replace(exp.alias_(select, name))

    return expression


def _build_parse_timestamp(args: t.List) -> exp.StrToTime:
    this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)])
    this.set("zone", seq_get(args, 2))
    return this


def _build_timestamp(args: t.List) -> exp.Timestamp:
    timestamp = exp.Timestamp.from_arg_list(args)
    timestamp.set("with_tz", True)
    return timestamp


def _build_date(args: t.List) -> exp.Date | exp.DateFromParts:
    expr_type = exp.DateFromParts if len(args) == 3 else exp.Date
    return expr_type.from_arg_list(args)


def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5:
    # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation
    arg = seq_get(args, 0)
    return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg)


def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str:
    return self.sql(
        exp.Exists(
            this=exp.select("1")
            .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"]))
            .where(exp.column("_col").eq(expression.right))
        )
    )


def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str:
    return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression))


def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str:
    expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP))
    expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP))
    unit = unit_to_var(expression)
    return self.func("DATE_DIFF", expression.this, expression.expression, unit)


def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("TIMESTAMP_SECONDS", timestamp)
    if scale == exp.UnixToTime.MILLIS:
        return self.func("TIMESTAMP_MILLIS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        return self.func("TIMESTAMP_MICROS", timestamp)

    unix_seconds = exp.cast(
        exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
    )
    return self.func("TIMESTAMP_SECONDS", unix_seconds)
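

# Illustrative sketch (not part of the original module): _unix_to_time_sql above picks the
# epoch helper that matches the node's scale, e.g.
#
#   >>> from sqlglot import exp
#   >>> node = exp.UnixToTime(this=exp.Literal.number(1), scale=exp.UnixToTime.MILLIS)
#   >>> node.sql(dialect="bigquery")
#   'TIMESTAMP_MILLIS(1)'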


def _build_time(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTime(this=args[0])
    if len(args) == 2:
        return exp.Time.from_arg_list(args)
    return exp.TimeFromParts.from_arg_list(args)


def _build_datetime(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToDatetime.from_arg_list(args)
    if len(args) == 2:
        return exp.Datetime.from_arg_list(args)
    return exp.TimestampFromParts.from_arg_list(args)


def _build_regexp_extract(
    expr_type: t.Type[E], default_group: t.Optional[exp.Expression] = None
) -> t.Callable[[t.List], E]:
    def _builder(args: t.List) -> E:
        try:
            group = re.compile(args[1].name).groups == 1
        except re.error:
            group = False

        # Default group is used for the transpilation of REGEXP_EXTRACT_ALL
        return expr_type(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            position=seq_get(args, 2),
            occurrence=seq_get(args, 3),
            group=exp.Literal.number(1) if group else default_group,
        )

    return _builder


def _build_extract_json_with_default_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]:
    def _builder(args: t.List, dialect: Dialect) -> E:
        if len(args) == 1:
            # The default value for the JSONPath is '$', i.e. all of the data
            args.append(exp.Literal.string("$"))
        return parser.build_extract_json_with_path(expr_type)(args, dialect)

    return _builder


def _str_to_datetime_sql(
    self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime
) -> str:
    this = self.sql(expression, "this")
    dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP"

    if expression.args.get("safe"):
        fmt = self.format_time(
            expression,
            self.dialect.INVERSE_FORMAT_MAPPING,
            self.dialect.INVERSE_FORMAT_TRIE,
        )
        return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})"

    fmt = self.format_time(expression)
    return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone"))


def _annotate_math_functions(self: TypeAnnotator, expression: E) -> E:
    """
    Many BigQuery math functions such as CEIL, FLOOR, etc. follow this return type convention:
    +--------+---------+---------+------------+---------+
    | INPUT  | INT64   | NUMERIC | BIGNUMERIC | FLOAT64 |
    +--------+---------+---------+------------+---------+
    | OUTPUT | FLOAT64 | NUMERIC | BIGNUMERIC | FLOAT64 |
    +--------+---------+---------+------------+---------+
    """
    self._annotate_args(expression)

    this: exp.Expression = expression.this

    self._set_type(
        expression,
        exp.DataType.Type.DOUBLE if this.is_type(*exp.DataType.INTEGER_TYPES) else this.type,
    )
    return expression


@unsupported_args("ins_cost", "del_cost", "sub_cost")
def _levenshtein_sql(self: BigQuery.Generator, expression: exp.Levenshtein) -> str:
    max_dist = expression.args.get("max_dist")
    if max_dist:
        max_dist = exp.Kwarg(this=exp.var("max_distance"), expression=max_dist)

    return self.func("EDIT_DISTANCE", expression.this, expression.expression, max_dist)


def _build_levenshtein(args: t.List) -> exp.Levenshtein:
    max_dist = seq_get(args, 2)
    return exp.Levenshtein(
        this=seq_get(args, 0),
        expression=seq_get(args, 1),
        max_dist=max_dist.expression if max_dist else None,
    )
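

# Illustrative sketch (not part of the original module): together, _build_levenshtein and
# _levenshtein_sql should round-trip BigQuery's named max_distance argument, e.g.
#
#   >>> import sqlglot
#   >>> sqlglot.transpile("SELECT EDIT_DISTANCE('a', 'b', max_distance => 2)", read="bigquery")
#   ["SELECT EDIT_DISTANCE('a', 'b', max_distance => 2)"]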


def _build_format_time(expr_type: t.Type[exp.Expression]) -> t.Callable[[t.List], exp.TimeToStr]:
    def _builder(args: t.List) -> exp.TimeToStr:
        return exp.TimeToStr(
            this=expr_type(this=seq_get(args, 1)),
            format=seq_get(args, 0),
            zone=seq_get(args, 2),
        )

    return _builder


def _build_contains_substring(args: t.List) -> exp.Contains | exp.Anonymous:
    if len(args) == 3:
        return exp.Anonymous(this="CONTAINS_SUBSTR", expressions=args)

    # Lowercase the operands in case of transpilation, as exp.Contains
    # is case-sensitive on other dialects
    this = exp.Lower(this=seq_get(args, 0))
    expr = exp.Lower(this=seq_get(args, 1))

    return exp.Contains(this=this, expression=expr)


def _json_extract_sql(self: BigQuery.Generator, expression: JSON_EXTRACT_TYPE) -> str:
    name = (expression._meta and expression.meta.get("name")) or expression.sql_name()
    upper = name.upper()

    dquote_escaping = upper in DQUOTES_ESCAPING_JSON_FUNCTIONS

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = False

    sql = rename_func(upper)(self, expression)

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = True

    return sql


def _annotate_concat(self: TypeAnnotator, expression: exp.Concat) -> exp.Concat:
    annotated = self._annotate_by_args(expression, "expressions")

    # Args must be BYTES or types that can be cast to STRING, return type is either BYTES or STRING
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#concat
    if not annotated.is_type(exp.DataType.Type.BINARY, exp.DataType.Type.UNKNOWN):
        annotated.type = exp.DataType.Type.VARCHAR

    return annotated


class BigQuery(Dialect):
    WEEK_OFFSET = -1
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    PRESERVE_ORIGINAL_NAMES = True
    HEX_STRING_IS_INTEGER_TYPE = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # BigQuery UDFs are case-sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
        "%e": "%-d",
    }

    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"}

    # All set operations require either a DISTINCT or ALL specifier
    SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None)

    # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types
    TYPE_TO_EXPRESSIONS = {
        **Dialect.TYPE_TO_EXPRESSIONS,
        exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP],
    }
    TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP)
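
    # Illustrative sketch (not part of the original module): via the annotators below and
    # _annotate_math_functions, an integer input to FLOOR should be typed as FLOAT64, e.g.
    #
    #   >>> import sqlglot
    #   >>> from sqlglot.optimizer.annotate_types import annotate_types
    #   >>> select = annotate_types(sqlglot.parse_one("SELECT FLOOR(5)"), dialect="bigquery")
    #   >>> select.selects[0].type.sql(dialect="bigquery")
    #   'FLOAT64'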

    ANNOTATORS = {
        **Dialect.ANNOTATORS,
        **{
            expr_type: annotate_with_type_lambda(data_type)
            for data_type, expressions in TYPE_TO_EXPRESSIONS.items()
            for expr_type in expressions
        },
        **{
            expr_type: lambda self, e: _annotate_math_functions(self, e)
            for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round)
        },
        **{
            expr_type: lambda self, e: self._annotate_by_args(e, "this")
            for expr_type in (
                exp.Left,
                exp.Right,
                exp.Lower,
                exp.Upper,
                exp.Pad,
                exp.Trim,
                exp.RegexpExtract,
                exp.RegexpReplace,
                exp.Repeat,
                exp.Substring,
            )
        },
        exp.Concat: _annotate_concat,
        exp.Sign: lambda self, e: self._annotate_by_args(e, "this"),
        exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True),
    }

    def normalize_identifier(self, expression: E) -> E:
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE
        ):
            parent = expression.parent
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

            return t.cast(E, expression)

        return super().normalize_identifier(expression)

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        NESTED_COMMENTS = False

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.COMMAND,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "EXPORT": TokenType.EXPORT,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
        }
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")

    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True

        # BigQuery does not allow ASC/DESC to be used as an identifier
        ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC}
        ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
        TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
        COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - {
            TokenType.ASC,
            TokenType.DESC,
        }
        UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
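
        # Illustrative note (not part of the original module): removing ASC/DESC from these
        # token sets means e.g. sqlglot.parse_one("SELECT 1 AS asc", read="bigquery") should
        # raise a ParseError, whereas dialects that keep those tokens accept the alias.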

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "CONTAINS_SUBSTR": _build_contains_substring,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=exp.Literal.string(str(seq_get(args, 1))),
                this=seq_get(args, 0),
                zone=seq_get(args, 2),
            ),
            "DATETIME": _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            "EDIT_DISTANCE": _build_levenshtein,
            "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate),
            "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
            "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract),
            "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "MD5": exp.MD5Digest.from_arg_list,
            "TO_HEX": _build_to_hex,
            "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIMESTAMP": _build_parse_timestamp,
            "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
            "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_EXTRACT_ALL": _build_regexp_extract(
                exp.RegexpExtractAll, default_group=exp.Literal.number(0)
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SPLIT": lambda args: exp.Split(
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
                this=seq_get(args, 0),
                expression=seq_get(args, 1) or exp.Literal.string(","),
            ),
            "STRPOS": exp.StrPosition.from_arg_list,
            "TIME": _build_time,
            "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
            "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
            "TIMESTAMP": _build_timestamp,
            "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
            "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
            "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
            ),
            "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
            "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
            "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime),
            "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp),
        }

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
            "MAKE_INTERVAL": lambda self: self._parse_make_interval(),
            "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(),
        }
        FUNCTION_PARSERS.pop("TRIM")
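
        # Illustrative sketch (not part of the original module): the FUNCTIONS table above
        # parses BigQuery builtins into canonical nodes that other dialects can re-render, e.g.
        #
        #   >>> import sqlglot
        #   >>> sqlglot.transpile("SELECT GENERATE_ARRAY(1, 10)", read="bigquery", write="postgres")
        #   ['SELECT GENERATE_SERIES(1, 10)']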

        NO_PAREN_FUNCTIONS = {
            **parser.Parser.NO_PAREN_FUNCTIONS,
            TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
        }

        NESTED_TYPE_TOKENS = {
            *parser.Parser.NESTED_TYPE_TOKENS,
            TokenType.TABLE,
        }

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "NOT DETERMINISTIC": lambda self: self.expression(
                exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
            ),
            "OPTIONS": lambda self: self._parse_with_property(),
        }

        CONSTRAINT_PARSERS = {
            **parser.Parser.CONSTRAINT_PARSERS,
            "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
        }

        RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
        RANGE_PARSERS.pop(TokenType.OVERLAPS)

        NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

        DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN}

        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
            TokenType.END: lambda self: self._parse_as_command(self._prev),
            TokenType.FOR: lambda self: self._parse_for_in(),
            TokenType.EXPORT: lambda self: self._parse_export_data(),
        }

        BRACKET_OFFSETS = {
            "OFFSET": (0, False),
            "ORDINAL": (1, False),
            "SAFE_OFFSET": (0, True),
            "SAFE_ORDINAL": (1, True),
        }

        def _parse_for_in(self) -> exp.ForIn:
            this = self._parse_range()
            self._match_text_seq("DO")
            return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

        def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
            this = super()._parse_table_part(schema=schema) or self._parse_number()

            # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
            if isinstance(this, exp.Identifier):
                table_name = this.name
                while self._match(TokenType.DASH, advance=False) and self._next:
                    start = self._curr
                    while self._is_connected() and not self._match_set(
                        self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False
                    ):
                        self._advance()

                    if start == self._curr:
                        break

                    table_name += self._find_sql(start, self._prev)

                this = exp.Identifier(
                    this=table_name, quoted=this.args.get("quoted")
                ).update_positions(this)
            elif isinstance(this, exp.Literal):
                table_name = this.name

                if self._is_connected() and self._parse_var(any_token=True):
                    table_name += self._prev.text

                this = exp.Identifier(this=table_name, quoted=True).update_positions(this)

            return this
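
        # Illustrative sketch (not part of the original module): the DASH handling above keeps
        # dashed project ids together as a single table part, e.g.
        #
        #   >>> import sqlglot
        #   >>> from sqlglot import exp
        #   >>> expr = sqlglot.parse_one("SELECT * FROM my-project.dataset.tbl", read="bigquery")
        #   >>> expr.find(exp.Table).catalog
        #   'my-project'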

        def _parse_table_parts(
            self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
        ) -> exp.Table:
            table = super()._parse_table_parts(
                schema=schema, is_db_reference=is_db_reference, wildcard=True
            )

            # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
            if not table.catalog:
                if table.db:
                    previous_db = table.args["db"]
                    parts = table.db.split(".")
                    if len(parts) == 2 and not table.args["db"].quoted:
                        table.set(
                            "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db)
                        )
                        table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db))
                else:
                    previous_this = table.this
                    parts = table.name.split(".")
                    if len(parts) == 2 and not table.this.quoted:
                        table.set(
                            "db", exp.Identifier(this=parts[0]).update_positions(previous_this)
                        )
                        table.set(
                            "this", exp.Identifier(this=parts[1]).update_positions(previous_this)
                        )

            if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
                alias = table.this
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                for part in (catalog, db, this):
                    if part:
                        part.update_positions(table.this)

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                table.meta["quoted_table"] = True
            else:
                alias = None

            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
            # dataset, so if the project identifier is omitted we need to fix the ast so that
            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
            # views, because it would seem like the "catalog" part is set, when it'd actually
            # be the region/dataset. Merging the two identifiers into a single one is done to
            # avoid producing a 4-part Table reference, which would cause issues in the schema
            # module, when there are 3-part table names mixed with information schema views.
            #
            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
            table_parts = table.parts
            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
                # We need to alias the table here to avoid breaking existing qualified columns.
                # This is expected to be safe, because if there's an actual alias coming up in
                # the token stream, it will overwrite this one. If there isn't one, we are only
                # exposing the name that can be used to reference the view explicitly (a no-op).
                exp.alias_(
                    table,
                    t.cast(exp.Identifier, alias or table_parts[-1]),
                    table=True,
                    copy=False,
                )

                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
                new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions(
                    line=table_parts[-2].meta.get("line"),
                    col=table_parts[-1].meta.get("col"),
                    start=table_parts[-2].meta.get("start"),
                    end=table_parts[-1].meta.get("end"),
                )
                table.set("this", new_this)
                table.set("db", seq_get(table_parts, -3))
                table.set("catalog", seq_get(table_parts, -4))

            return table

        def _parse_column(self) -> t.Optional[exp.Expression]:
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    column.meta["quoted_column"] = True

            return column
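
        # Illustrative sketch (not part of the original module): _parse_table_parts and
        # _parse_column above split fully-quoted multi-part names, e.g.
        #
        #   >>> import sqlglot
        #   >>> from sqlglot import exp
        #   >>> table = sqlglot.parse_one("SELECT * FROM `proj.dataset.tbl`", read="bigquery").find(exp.Table)
        #   >>> table.catalog, table.db, table.name
        #   ('proj', 'dataset', 'tbl')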

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

        def _parse_json_object(self, agg=False):
            json_object = super()._parse_json_object()
            array_kv_pair = seq_get(json_object.expressions, 0)

            # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
            if (
                array_kv_pair
                and isinstance(array_kv_pair.this, exp.Array)
                and isinstance(array_kv_pair.expression, exp.Array)
            ):
                keys = array_kv_pair.this.expressions
                values = array_kv_pair.expression.expressions

                json_object.set(
                    "expressions",
                    [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
                )

            return json_object

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            bracket = super()._parse_bracket(this)

            if this is bracket:
                return bracket

            if isinstance(bracket, exp.Bracket):
                for expression in bracket.expressions:
                    name = expression.name.upper()

                    if name not in self.BRACKET_OFFSETS:
                        break

                    offset, safe = self.BRACKET_OFFSETS[name]
                    bracket.set("offset", offset)
                    bracket.set("safe", safe)
                    expression.replace(expression.expressions[0])

            return bracket

        def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
            unnest = super()._parse_unnest(with_alias=with_alias)

            if not unnest:
                return None

            unnest_expr = seq_get(unnest.expressions, 0)
            if unnest_expr:
                from sqlglot.optimizer.annotate_types import annotate_types

                unnest_expr = annotate_types(unnest_expr, dialect=self.dialect)

                # Unnesting a nested array (i.e. an array of structs) explodes the top-level struct fields,
                # in contrast to other dialects such as DuckDB which flattens only the array by default
                if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any(
                    array_elem.is_type(exp.DataType.Type.STRUCT)
                    for array_elem in unnest_expr._type.expressions
                ):
                    unnest.set("explode_array", True)

            return unnest

        def _parse_make_interval(self) -> exp.MakeInterval:
            expr = exp.MakeInterval()

            for arg_key in expr.arg_types:
                value = self._parse_lambda()

                if not value:
                    break

                # Non-named arguments are filled sequentially, (optionally) followed by named
                # arguments that can appear in any order, e.g. MAKE_INTERVAL(1, minute => 5, day => 2)
                if isinstance(value, exp.Kwarg):
                    arg_key = value.this.name

                expr.set(arg_key, value)

                self._match(TokenType.COMMA)

            return expr

        def _parse_features_at_time(self) -> exp.FeaturesAtTime:
            expr = self.expression(
                exp.FeaturesAtTime,
                this=(self._match(TokenType.TABLE) and self._parse_table())
                or self._parse_select(nested=True),
            )

            while self._match(TokenType.COMMA):
                arg = self._parse_lambda()

                # Get the LHS of the Kwarg and set the arg to that value, e.g.
                # "num_rows => 1" sets the expr's `num_rows` arg
                if arg:
                    expr.set(arg.this.name, arg)

            return expr

        def _parse_export_data(self) -> exp.Export:
            self._match_text_seq("DATA")

            return self.expression(
                exp.Export,
                connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(),
                options=self._parse_properties(),
                this=self._match_text_seq("AS") and self._parse_select(),
            )
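
    # Illustrative note (not part of the original module): _parse_export_data above targets
    # statements shaped like
    #   EXPORT DATA WITH CONNECTION conn OPTIONS (uri = 'gs://...') AS SELECT ...
    # and records the connection, the OPTIONS properties and the SELECT on an exp.Export node.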

    class Generator(generator.Generator):
        INTERVAL_ALLOWS_PLURAL_FORM = False
        JOIN_HINTS = False
        QUERY_HINTS = False
        TABLE_HINTS = False
        LIMIT_FETCH = "LIMIT"
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        UNNEST_WITH_ORDINALITY = False
        COLLATE_IS_FUNC = True
        LIMIT_ONLY_LITERALS = True
        SUPPORTS_TABLE_ALIAS_COLUMNS = False
        UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        NULL_ORDERING_SUPPORTED = False
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_SINGLE_QUOTE_ESCAPE = True
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        NAMED_PLACEHOLDER_TOKEN = "@"
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"
        SUPPORTS_EXPLODING_PROJECTIONS = False
        EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False
        SUPPORTS_UNIX_SECONDS = True
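
        # Illustrative sketch (not part of the original module): one of the flags above in
        # action, NAMED_PLACEHOLDER_TOKEN = "@" renders named placeholders BigQuery-style, e.g.
        #
        #   >>> import sqlglot
        #   >>> sqlglot.transpile("SELECT :name", read="oracle", write="bigquery")
        #   ['SELECT @name']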

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.ArrayRemove: filter_array_using_unnest,
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.DateTrunc: lambda self, e: self.func(
                "DATE_TRUNC", e.this, e.text("unit"), e.args.get("zone")
            ),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: lambda self, e: groupconcat_sql(
                self, e, func_name="STRING_AGG", within_group=False
            ),
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.Int64: rename_func("INT64"),
            exp.JSONExtract: _json_extract_sql,
            exp.JSONExtractArray: _json_extract_sql,
            exp.JSONExtractScalar: _json_extract_sql,
            exp.JSONFormat: rename_func("TO_JSON_STRING"),
            exp.Levenshtein: _levenshtein_sql,
            exp.Max: max_or_greatest,
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpExtractAll: lambda self, e: self.func(
                "REGEXP_EXTRACT_ALL", e.this, e.expression
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.Select: transforms.preprocess(
                [
                    transforms.explode_projection_to_unnest(),
                    transforms.unqualify_unnest,
                    transforms.eliminate_distinct_on,
                    _alias_ordered_group,
                    transforms.eliminate_semi_and_anti_joins,
                ]
            ),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.StabilityProperty: lambda self, e: (
                "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
            ),
            exp.String: rename_func("STRING"),
            exp.StrPosition: lambda self, e: (
                strposition_sql(
                    self, e, func_name="INSTR", supports_position=True, supports_occurrence=True
                )
            ),
            exp.StrToDate: _str_to_datetime_sql,
            exp.StrToTime: _str_to_datetime_sql,
            exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
            exp.TimeFromParts: rename_func("TIME"),
            exp.TimestampFromParts: rename_func("DATETIME"),
            exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
            exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
            exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
            exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.Transaction: lambda *_: "BEGIN TRANSACTION",
            exp.TsOrDsAdd: _ts_or_ds_add_sql,
            exp.TsOrDsDiff: _ts_or_ds_diff_sql,
            exp.TsOrDsToTime: rename_func("TIME"),
            exp.TsOrDsToDatetime: rename_func("DATETIME"),
            exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"),
            exp.Unhex: rename_func("FROM_HEX"),
            exp.UnixDate: rename_func("UNIX_DATE"),
            exp.UnixToTime: _unix_to_time_sql,
            exp.Uuid: lambda *_: "GENERATE_UUID()",
            exp.Values: _derived_table_values_to_unnest,
            exp.VariancePop: rename_func("VAR_POP"),
            exp.SafeDivide: rename_func("SAFE_DIVIDE"),
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
            exp.DataType.Type.BIGINT: "INT64",
            exp.DataType.Type.BINARY: "BYTES",
            exp.DataType.Type.BLOB: "BYTES",
            exp.DataType.Type.BOOLEAN: "BOOL",
            exp.DataType.Type.CHAR: "STRING",
            exp.DataType.Type.DECIMAL: "NUMERIC",
            exp.DataType.Type.DOUBLE: "FLOAT64",
            exp.DataType.Type.FLOAT: "FLOAT64",
            exp.DataType.Type.INT: "INT64",
            exp.DataType.Type.NCHAR: "STRING",
            exp.DataType.Type.NVARCHAR: "STRING",
            exp.DataType.Type.SMALLINT: "INT64",
            exp.DataType.Type.TEXT: "STRING",
            exp.DataType.Type.TIMESTAMP: "DATETIME",
            exp.DataType.Type.TIMESTAMPNTZ: "DATETIME",
            exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
            exp.DataType.Type.TINYINT: "INT64",
            exp.DataType.Type.ROWVERSION: "BYTES",
            exp.DataType.Type.UUID: "STRING",
            exp.DataType.Type.VARBINARY: "BYTES",
            exp.DataType.Type.VARCHAR: "STRING",
            exp.DataType.Type.VARIANT: "ANY TYPE",
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }
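
        # Illustrative sketch (not part of the original module): TYPE_MAPPING above drives type
        # renaming, and the exp.Cast transform strips parameterized precisions, e.g.
        #
        #   >>> import sqlglot
        #   >>> sqlglot.transpile("CAST(x AS VARCHAR(10))", write="bigquery")
        #   ['CAST(x AS STRING)']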

        # WINDOW comes after QUALIFY
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
        AFTER_HAVING_MODIFIER_TRANSFORMS = {
            "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
            "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
        }

        # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
        RESERVED_KEYWORDS = {
            "all",
            "and",
            "any",
            "array",
            "as",
            "asc",
            "assert_rows_modified",
            "at",
            "between",
            "by",
            "case",
            "cast",
            "collate",
            "contains",
            "create",
            "cross",
            "cube",
            "current",
            "default",
            "define",
            "desc",
            "distinct",
            "else",
            "end",
            "enum",
            "escape",
            "except",
            "exclude",
            "exists",
            "extract",
            "false",
            "fetch",
            "following",
            "for",
            "from",
            "full",
            "group",
            "grouping",
            "groups",
            "hash",
            "having",
            "if",
            "ignore",
            "in",
            "inner",
            "intersect",
            "interval",
            "into",
            "is",
            "join",
            "lateral",
            "left",
            "like",
            "limit",
            "lookup",
            "merge",
            "natural",
            "new",
            "no",
            "not",
            "null",
            "nulls",
            "of",
            "on",
            "or",
            "order",
            "outer",
            "over",
            "partition",
            "preceding",
            "proto",
            "qualify",
            "range",
            "recursive",
            "respect",
            "right",
            "rollup",
            "rows",
            "select",
            "set",
            "some",
            "struct",
            "tablesample",
            "then",
            "to",
            "treat",
            "true",
            "unbounded",
            "union",
            "unnest",
            "using",
            "when",
            "where",
            "window",
            "with",
            "within",
        }

        def mod_sql(self, expression: exp.Mod) -> str:
            this = expression.this
            expr = expression.expression
            return self.func(
                "MOD",
                this.unnest() if isinstance(this, exp.Paren) else this,
                expr.unnest() if isinstance(expr, exp.Paren) else expr,
            )

        def column_parts(self, expression: exp.Column) -> str:
            if expression.meta.get("quoted_column"):
                # If a column reference is of the form `dataset.table`.name, we need
                # to preserve the quoted table path, otherwise the reference breaks
                table_parts = ".".join(p.name for p in expression.parts[:-1])
                table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
                return f"{table_path}.{self.sql(expression, 'this')}"

            return super().column_parts(expression)
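
        # Illustrative sketch (not part of the original module): mod_sql above unwraps
        # parenthesized operands, so % becomes a clean MOD call, e.g.
        #
        #   >>> import sqlglot
        #   >>> sqlglot.transpile("SELECT (a) % (b)", write="bigquery")
        #   ['SELECT MOD(a, b)']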

        def table_parts(self, expression: exp.Table) -> str:
            # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
            # we need to make sure the correct quoting is used in each case.
            #
            # For example, if there is a CTE x that clashes with a schema name, then the former will
            # return the table y in that schema, whereas the latter will return the CTE's y column:
            #
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
            if expression.meta.get("quoted_table"):
                table_parts = ".".join(p.name for p in expression.parts)
                return self.sql(exp.Identifier(this=table_parts, quoted=True))

            return super().table_parts(expression)

        def timetostr_sql(self, expression: exp.TimeToStr) -> str:
            this = expression.this
            if isinstance(this, exp.TsOrDsToDatetime):
                func_name = "FORMAT_DATETIME"
            elif isinstance(this, exp.TsOrDsToTimestamp):
                func_name = "FORMAT_TIMESTAMP"
            else:
                func_name = "FORMAT_DATE"

            time_expr = (
                this
                if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
                else expression
            )
            return self.func(
                func_name, self.format_time(expression), time_expr.this, expression.args.get("zone")
            )

        def eq_sql(self, expression: exp.EQ) -> str:
            # Operands of = cannot be NULL in BigQuery
            if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
                if not isinstance(expression.parent, exp.Update):
                    return "NULL"

            return self.binary(expression, "=")

        def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
            parent = expression.parent

            # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
            # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
            if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
                return self.func(
                    "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
                )

            return super().attimezone_sql(expression)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            return self.cast_sql(expression, safe_prefix="SAFE_")

        def bracket_sql(self, expression: exp.Bracket) -> str:
            this = expression.this
            expressions = expression.expressions

            if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
                arg = expressions[0]
                if arg.type is None:
                    from sqlglot.optimizer.annotate_types import annotate_types

                    arg = annotate_types(arg, dialect=self.dialect)

                if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                    # BQ doesn't support bracket syntax with string values for structs
                    return f"{self.sql(this)}.{arg.name}"

            expressions_sql = self.expressions(expression, flat=True)
            offset = expression.args.get("offset")

            if offset == 0:
                expressions_sql = f"OFFSET({expressions_sql})"
            elif offset == 1:
                expressions_sql = f"ORDINAL({expressions_sql})"
            elif offset is not None:
                self.unsupported(f"Unsupported array offset: {offset}")

            if expression.args.get("safe"):
                expressions_sql = f"SAFE_{expressions_sql}"

            return f"{self.sql(this)}[{expressions_sql}]"

        def in_unnest_op(self, expression: exp.Unnest) -> str:
            return self.sql(expression)

        def version_sql(self, expression: exp.Version) -> str:
            if expression.name == "TIMESTAMP":
                expression.set("this", "SYSTEM_TIME")
            return super().version_sql(expression)
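
        # Illustrative sketch (not part of the original module): bracket_sql above (together
        # with Parser.BRACKET_OFFSETS) should round-trip BigQuery's subscript wrappers, e.g.
        #
        #   >>> import sqlglot
        #   >>> sqlglot.transpile("SELECT arr[SAFE_OFFSET(0)] FROM t", read="bigquery")
        #   ['SELECT arr[SAFE_OFFSET(0)] FROM t']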

        def contains_sql(self, expression: exp.Contains) -> str:
            this = expression.this
            expr = expression.expression

            if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower):
                this = this.this
                expr = expr.this

            return self.func("CONTAINS_SUBSTR", this, expr)

        def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
            this = expression.this

            # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3]
            # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions,
            # because they aren't literals and so the above syntax is invalid BigQuery.
            if isinstance(this, exp.Array):
                elem = seq_get(this.expressions, 0)
                if not (elem and elem.find(exp.Query)):
                    return f"{self.sql(expression, 'to')}{self.sql(this)}"

            return super().cast_sql(expression, safe_prefix=safe_prefix)
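

# Illustrative usage sketch (not part of the original module): the dialect registers itself
# under the name "bigquery", so the top-level API can use it directly, e.g.
#
#   >>> import sqlglot
#   >>> sqlglot.transpile("SELECT TO_HEX(MD5('a'))", read="bigquery", write="bigquery")
#   ["SELECT TO_HEX(MD5('a'))"]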
JOIN_HINTS = False 915 QUERY_HINTS = False 916 TABLE_HINTS = False 917 LIMIT_FETCH = "LIMIT" 918 RENAME_TABLE_WITH_DB = False 919 NVL2_SUPPORTED = False 920 UNNEST_WITH_ORDINALITY = False 921 COLLATE_IS_FUNC = True 922 LIMIT_ONLY_LITERALS = True 923 SUPPORTS_TABLE_ALIAS_COLUMNS = False 924 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 925 JSON_KEY_VALUE_PAIR_SEP = "," 926 NULL_ORDERING_SUPPORTED = False 927 IGNORE_NULLS_IN_FUNC = True 928 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 929 CAN_IMPLEMENT_ARRAY_ANY = True 930 SUPPORTS_TO_NUMBER = False 931 NAMED_PLACEHOLDER_TOKEN = "@" 932 HEX_FUNC = "TO_HEX" 933 WITH_PROPERTIES_PREFIX = "OPTIONS" 934 SUPPORTS_EXPLODING_PROJECTIONS = False 935 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 936 SUPPORTS_UNIX_SECONDS = True 937 938 TRANSFORMS = { 939 **generator.Generator.TRANSFORMS, 940 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 941 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 942 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 943 exp.Array: inline_array_unless_query, 944 exp.ArrayContains: _array_contains_sql, 945 exp.ArrayFilter: filter_array_using_unnest, 946 exp.ArrayRemove: filter_array_using_unnest, 947 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 948 exp.CollateProperty: lambda self, e: ( 949 f"DEFAULT COLLATE {self.sql(e, 'this')}" 950 if e.args.get("default") 951 else f"COLLATE {self.sql(e, 'this')}" 952 ), 953 exp.Commit: lambda *_: "COMMIT TRANSACTION", 954 exp.CountIf: rename_func("COUNTIF"), 955 exp.Create: _create_sql, 956 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 957 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 958 exp.DateDiff: lambda self, e: self.func( 959 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 960 ), 961 exp.DateFromParts: rename_func("DATE"), 962 exp.DateStrToDate: datestrtodate_sql, 963 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 964 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 965 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 966 exp.DateTrunc: lambda self, e: self.func( 967 "DATE_TRUNC", e.this, e.text("unit"), e.args.get("zone") 968 ), 969 exp.FromTimeZone: lambda self, e: self.func( 970 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 971 ), 972 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 973 exp.GroupConcat: lambda self, e: groupconcat_sql( 974 self, e, func_name="STRING_AGG", within_group=False 975 ), 976 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 977 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 978 exp.If: if_sql(false_value="NULL"), 979 exp.ILike: no_ilike_sql, 980 exp.IntDiv: rename_func("DIV"), 981 exp.Int64: rename_func("INT64"), 982 exp.JSONExtract: _json_extract_sql, 983 exp.JSONExtractArray: _json_extract_sql, 984 exp.JSONExtractScalar: _json_extract_sql, 985 exp.JSONFormat: rename_func("TO_JSON_STRING"), 986 exp.Levenshtein: _levenshtein_sql, 987 exp.Max: max_or_greatest, 988 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 989 exp.MD5Digest: rename_func("MD5"), 990 exp.Min: min_or_least, 991 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 992 exp.RegexpExtract: lambda self, e: self.func( 993 "REGEXP_EXTRACT", 994 e.this, 995 e.expression, 996 e.args.get("position"), 997 e.args.get("occurrence"), 998 ), 999 exp.RegexpExtractAll: lambda self, e: self.func( 1000 "REGEXP_EXTRACT_ALL", e.this, e.expression 1001 ), 1002 
exp.RegexpReplace: regexp_replace_sql, 1003 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1004 exp.ReturnsProperty: _returnsproperty_sql, 1005 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1006 exp.Select: transforms.preprocess( 1007 [ 1008 transforms.explode_projection_to_unnest(), 1009 transforms.unqualify_unnest, 1010 transforms.eliminate_distinct_on, 1011 _alias_ordered_group, 1012 transforms.eliminate_semi_and_anti_joins, 1013 ] 1014 ), 1015 exp.SHA: rename_func("SHA1"), 1016 exp.SHA2: sha256_sql, 1017 exp.StabilityProperty: lambda self, e: ( 1018 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1019 ), 1020 exp.String: rename_func("STRING"), 1021 exp.StrPosition: lambda self, e: ( 1022 strposition_sql( 1023 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1024 ) 1025 ), 1026 exp.StrToDate: _str_to_datetime_sql, 1027 exp.StrToTime: _str_to_datetime_sql, 1028 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1029 exp.TimeFromParts: rename_func("TIME"), 1030 exp.TimestampFromParts: rename_func("DATETIME"), 1031 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1032 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1033 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1034 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1035 exp.TimeStrToTime: timestrtotime_sql, 1036 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1037 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1038 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1039 exp.TsOrDsToTime: rename_func("TIME"), 1040 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1041 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1042 exp.Unhex: rename_func("FROM_HEX"), 1043 exp.UnixDate: rename_func("UNIX_DATE"), 1044 exp.UnixToTime: _unix_to_time_sql, 1045 exp.Uuid: lambda *_: "GENERATE_UUID()", 1046 exp.Values: _derived_table_values_to_unnest, 1047 exp.VariancePop: rename_func("VAR_POP"), 1048 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1049 } 1050 1051 SUPPORTED_JSON_PATH_PARTS = { 1052 exp.JSONPathKey, 1053 exp.JSONPathRoot, 1054 exp.JSONPathSubscript, 1055 } 1056 1057 TYPE_MAPPING = { 1058 **generator.Generator.TYPE_MAPPING, 1059 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1060 exp.DataType.Type.BIGINT: "INT64", 1061 exp.DataType.Type.BINARY: "BYTES", 1062 exp.DataType.Type.BLOB: "BYTES", 1063 exp.DataType.Type.BOOLEAN: "BOOL", 1064 exp.DataType.Type.CHAR: "STRING", 1065 exp.DataType.Type.DECIMAL: "NUMERIC", 1066 exp.DataType.Type.DOUBLE: "FLOAT64", 1067 exp.DataType.Type.FLOAT: "FLOAT64", 1068 exp.DataType.Type.INT: "INT64", 1069 exp.DataType.Type.NCHAR: "STRING", 1070 exp.DataType.Type.NVARCHAR: "STRING", 1071 exp.DataType.Type.SMALLINT: "INT64", 1072 exp.DataType.Type.TEXT: "STRING", 1073 exp.DataType.Type.TIMESTAMP: "DATETIME", 1074 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1075 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1076 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1077 exp.DataType.Type.TINYINT: "INT64", 1078 exp.DataType.Type.ROWVERSION: "BYTES", 1079 exp.DataType.Type.UUID: "STRING", 1080 exp.DataType.Type.VARBINARY: "BYTES", 1081 exp.DataType.Type.VARCHAR: "STRING", 1082 exp.DataType.Type.VARIANT: "ANY TYPE", 1083 } 1084 1085 PROPERTIES_LOCATION = { 1086 **generator.Generator.PROPERTIES_LOCATION, 1087 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1088 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1089 } 1090 1091 # WINDOW comes after QUALIFY 1092 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1093 
AFTER_HAVING_MODIFIER_TRANSFORMS = { 1094 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1095 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1096 } 1097 1098 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1099 RESERVED_KEYWORDS = { 1100 "all", 1101 "and", 1102 "any", 1103 "array", 1104 "as", 1105 "asc", 1106 "assert_rows_modified", 1107 "at", 1108 "between", 1109 "by", 1110 "case", 1111 "cast", 1112 "collate", 1113 "contains", 1114 "create", 1115 "cross", 1116 "cube", 1117 "current", 1118 "default", 1119 "define", 1120 "desc", 1121 "distinct", 1122 "else", 1123 "end", 1124 "enum", 1125 "escape", 1126 "except", 1127 "exclude", 1128 "exists", 1129 "extract", 1130 "false", 1131 "fetch", 1132 "following", 1133 "for", 1134 "from", 1135 "full", 1136 "group", 1137 "grouping", 1138 "groups", 1139 "hash", 1140 "having", 1141 "if", 1142 "ignore", 1143 "in", 1144 "inner", 1145 "intersect", 1146 "interval", 1147 "into", 1148 "is", 1149 "join", 1150 "lateral", 1151 "left", 1152 "like", 1153 "limit", 1154 "lookup", 1155 "merge", 1156 "natural", 1157 "new", 1158 "no", 1159 "not", 1160 "null", 1161 "nulls", 1162 "of", 1163 "on", 1164 "or", 1165 "order", 1166 "outer", 1167 "over", 1168 "partition", 1169 "preceding", 1170 "proto", 1171 "qualify", 1172 "range", 1173 "recursive", 1174 "respect", 1175 "right", 1176 "rollup", 1177 "rows", 1178 "select", 1179 "set", 1180 "some", 1181 "struct", 1182 "tablesample", 1183 "then", 1184 "to", 1185 "treat", 1186 "true", 1187 "unbounded", 1188 "union", 1189 "unnest", 1190 "using", 1191 "when", 1192 "where", 1193 "window", 1194 "with", 1195 "within", 1196 } 1197 1198 def mod_sql(self, expression: exp.Mod) -> str: 1199 this = expression.this 1200 expr = expression.expression 1201 return self.func( 1202 "MOD", 1203 this.unnest() if isinstance(this, exp.Paren) else this, 1204 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1205 ) 1206 1207 def column_parts(self, expression: exp.Column) -> str: 1208 if expression.meta.get("quoted_column"): 1209 # If a column reference is of the form `dataset.table`.name, we need 1210 # to preserve the quoted table path, otherwise the reference breaks 1211 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1212 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1213 return f"{table_path}.{self.sql(expression, 'this')}" 1214 1215 return super().column_parts(expression) 1216 1217 def table_parts(self, expression: exp.Table) -> str: 1218 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1219 # we need to make sure the correct quoting is used in each case. 
1220 # 1221 # For example, if there is a CTE x that clashes with a schema name, then the former will 1222 # return the table y in that schema, whereas the latter will return the CTE's y column: 1223 # 1224 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1225 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1226 if expression.meta.get("quoted_table"): 1227 table_parts = ".".join(p.name for p in expression.parts) 1228 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1229 1230 return super().table_parts(expression) 1231 1232 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1233 this = expression.this 1234 if isinstance(this, exp.TsOrDsToDatetime): 1235 func_name = "FORMAT_DATETIME" 1236 elif isinstance(this, exp.TsOrDsToTimestamp): 1237 func_name = "FORMAT_TIMESTAMP" 1238 else: 1239 func_name = "FORMAT_DATE" 1240 1241 time_expr = ( 1242 this 1243 if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate)) 1244 else expression 1245 ) 1246 return self.func( 1247 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1248 ) 1249 1250 def eq_sql(self, expression: exp.EQ) -> str: 1251 # Operands of = cannot be NULL in BigQuery 1252 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1253 if not isinstance(expression.parent, exp.Update): 1254 return "NULL" 1255 1256 return self.binary(expression, "=") 1257 1258 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1259 parent = expression.parent 1260 1261 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1262 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1263 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1264 return self.func( 1265 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1266 ) 1267 1268 return super().attimezone_sql(expression) 1269 1270 def trycast_sql(self, expression: exp.TryCast) -> str: 1271 return self.cast_sql(expression, safe_prefix="SAFE_") 1272 1273 def bracket_sql(self, expression: exp.Bracket) -> str: 1274 this = expression.this 1275 expressions = expression.expressions 1276 1277 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1278 arg = expressions[0] 1279 if arg.type is None: 1280 from sqlglot.optimizer.annotate_types import annotate_types 1281 1282 arg = annotate_types(arg, dialect=self.dialect) 1283 1284 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1285 # BQ doesn't support bracket syntax with string values for structs 1286 return f"{self.sql(this)}.{arg.name}" 1287 1288 expressions_sql = self.expressions(expression, flat=True) 1289 offset = expression.args.get("offset") 1290 1291 if offset == 0: 1292 expressions_sql = f"OFFSET({expressions_sql})" 1293 elif offset == 1: 1294 expressions_sql = f"ORDINAL({expressions_sql})" 1295 elif offset is not None: 1296 self.unsupported(f"Unsupported array offset: {offset}") 1297 1298 if expression.args.get("safe"): 1299 expressions_sql = f"SAFE_{expressions_sql}" 1300 1301 return f"{self.sql(this)}[{expressions_sql}]" 1302 1303 def in_unnest_op(self, expression: exp.Unnest) -> str: 1304 return self.sql(expression) 1305 1306 def version_sql(self, expression: exp.Version) -> str: 1307 if expression.name == "TIMESTAMP": 1308 expression.set("this", "SYSTEM_TIME") 1309 return super().version_sql(expression) 1310 1311 def contains_sql(self, 
expression: exp.Contains) -> str: 1312 this = expression.this 1313 expr = expression.expression 1314 1315 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1316 this = this.this 1317 expr = expr.this 1318 1319 return self.func("CONTAINS_SUBSTR", this, expr) 1320 1321 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1322 this = expression.this 1323 1324 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1325 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1326 # because they aren't literals and so the above syntax is invalid BigQuery. 1327 if isinstance(this, exp.Array): 1328 elem = seq_get(this.expressions, 0) 1329 if not (elem and elem.find(exp.Query)): 1330 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1331 1332 return super().cast_sql(expression, safe_prefix=safe_prefix)
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
Whether the base comes first in the LOG function.
Possible values: True, False, None (two arguments are not supported by LOG).
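Since this flag only controls argument order, its effect is easiest to see by transpiling a two-argument LOG call between a base-first and a base-second dialect. A minimal sketch; the outputs are printed rather than asserted, and the choice of Postgres (base-first) and BigQuery (base-second) is an assumption based on their documentation:

import sqlglot

# Postgres writes LOG(base, x), while BigQuery writes LOG(x, base), so the
# two arguments may swap places during transpilation.
print(sqlglot.transpile("SELECT LOG(10, 100)", read="postgres", write="bigquery")[0])
print(sqlglot.transpile("SELECT LOG(100, 10)", read="bigquery", write="postgres")[0])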
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS (SELECT 1 AS id, 2 AS my_id) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except:
- BigQuery, which will forward the alias to GROUP BY + HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- ClickHouse, which will forward the alias across the query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
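A minimal sketch of how this difference surfaces through the optimizer's qualification step; the printed SQL is illustrative and depends on the sqlglot version:

import sqlglot
from sqlglot.optimizer.qualify import qualify

sql = (
    "WITH data AS (SELECT 1 AS id, 2 AS my_id) "
    "SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1"
)

# The same alias resolves differently depending on the dialect's rules.
for dialect in ("bigquery", "clickhouse", "duckdb"):
    qualified = qualify(sqlglot.parse_one(sql, read=dialect), dialect=dialect)
    print(dialect, "->", qualified.sql(dialect=dialect))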
Whether the name of the function should be preserved inside the node's metadata. This can be useful for roundtripping deprecated vs. new functions that share an AST node, e.g. JSON_VALUE vs. JSON_EXTRACT_SCALAR in BigQuery.
Whether hex strings such as x'CC' evaluate to integer or binary/blob type
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized.
Possible values:
- "upper" or True: Convert names to uppercase.
- "lower": Convert names to lowercase.
- False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy'). If empty, the corresponding trie will be constructed off of TIME_MAPPING.
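For instance, BigQuery's %-style format elements are translated through TIME_MAPPING into sqlglot's internal (Python strftime-like) notation, which the target dialect then renders in its own syntax. A minimal sketch; the exact output is illustrative:

import sqlglot

# The format string survives the roundtrip because both dialects' time
# mappings agree on these tokens; duckdb renders the call as STRPTIME.
print(sqlglot.transpile(
    "SELECT PARSE_TIMESTAMP('%Y-%m-%d', '2024-01-01')",
    read="bigquery",
    write="duckdb",
)[0])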
Columns that are auto-generated by the engine corresponding to this dialect. For example, such columns may be excluded from SELECT * queries.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL must be explicitly specified.
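BigQuery is a dialect where the qualifier is mandatory (UNION must be written as UNION ALL or UNION DISTINCT), so transpiling a bare UNION into BigQuery has to spell the default out. A minimal sketch; the output is illustrative:

import sqlglot

# A bare UNION defaults to DISTINCT semantics, which BigQuery requires
# to be explicit.
print(sqlglot.transpile("SELECT 1 UNION SELECT 2", read="postgres", write="bigquery")[0])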
def normalize_identifier(self, expression: E) -> E:
    if (
        isinstance(expression, exp.Identifier)
        and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE
    ):
        parent = expression.parent
        while isinstance(parent, exp.Dot):
            parent = parent.parent

        # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
        # by default. The following check uses a heuristic to detect tables based on whether
        # they are qualified. This should generally be correct, because tables in BigQuery
        # must be qualified with at least a dataset, unless @@dataset_id is set.
        case_sensitive = (
            isinstance(parent, exp.UserDefinedFunction)
            or (
                isinstance(parent, exp.Table)
                and parent.db
                and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
            )
            or expression.meta.get("is_table")
        )
        if not case_sensitive:
            expression.set("this", expression.this.lower())

        return t.cast(E, expression)

    return super().normalize_identifier(expression)
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.

For example, an identifier like FoO would be resolved as foo in Postgres, because it lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive, and so any normalization would be prohibited in order to avoid "breaking" the identifier.

There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system; for example, they may always be case-sensitive in Linux.

Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.

SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
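A minimal sketch of the BigQuery behavior, exercising the heuristic from the method body above (the is_table metadata flag mirrors what the parser would set):

from sqlglot import exp
from sqlglot.dialects.bigquery import BigQuery

dialect = BigQuery()

# A bare, unqualified identifier is treated as case-insensitive and lowered.
print(dialect.normalize_identifier(exp.to_identifier("FoO")).name)  # foo

# An identifier known to name a table is kept case-sensitive.
table_ident = exp.to_identifier("FoO")
table_ident.meta["is_table"] = True
print(dialect.normalize_identifier(table_ident).name)  # FoO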
Mapping of an escaped sequence (e.g. \n) to its unescaped version (a literal newline character).
class Tokenizer(tokens.Tokenizer):
    QUOTES = ["'", '"', '"""', "'''"]
    COMMENTS = ["--", "#", ("/*", "*/")]
    IDENTIFIERS = ["`"]
    STRING_ESCAPES = ["\\"]

    HEX_STRINGS = [("0x", ""), ("0X", "")]

    BYTE_STRINGS = [
        (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
    ]

    RAW_STRINGS = [
        (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
    ]

    NESTED_COMMENTS = False

    KEYWORDS = {
        **tokens.Tokenizer.KEYWORDS,
        "ANY TYPE": TokenType.VARIANT,
        "BEGIN": TokenType.COMMAND,
        "BEGIN TRANSACTION": TokenType.BEGIN,
        "BYTEINT": TokenType.INT,
        "BYTES": TokenType.BINARY,
        "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
        "DATETIME": TokenType.TIMESTAMP,
        "DECLARE": TokenType.COMMAND,
        "ELSEIF": TokenType.COMMAND,
        "EXCEPTION": TokenType.COMMAND,
        "EXPORT": TokenType.EXPORT,
        "FLOAT64": TokenType.DOUBLE,
        "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
        "MODEL": TokenType.MODEL,
        "NOT DETERMINISTIC": TokenType.VOLATILE,
        "RECORD": TokenType.STRUCT,
        "TIMESTAMP": TokenType.TIMESTAMPTZ,
    }
    KEYWORDS.pop("DIV")
    KEYWORDS.pop("VALUES")
    KEYWORDS.pop("/*+")
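A minimal sketch showing a few of these rules in action (raw/byte string prefixes, backtick identifiers, # comments); token types are printed rather than asserted:

import sqlglot

sql = "SELECT r'a\\n', b'bytes' FROM `my-project.ds.t` # trailing comment"

for token in sqlglot.tokenize(sql, read="bigquery"):
    print(token.token_type, repr(token.text))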
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
class Parser(parser.Parser):
    PREFIXED_PIVOT_COLUMNS = True
    LOG_DEFAULTS_TO_LN = True
    SUPPORTS_IMPLICIT_UNNEST = True

    # BigQuery does not allow ASC/DESC to be used as an identifier
    ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC}
    ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
    TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
    COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - {
        TokenType.ASC,
        TokenType.DESC,
    }
    UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}

    FUNCTIONS = {
        **parser.Parser.FUNCTIONS,
        "CONTAINS_SUBSTR": _build_contains_substring,
        "DATE": _build_date,
        "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
        "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
        "DATE_TRUNC": lambda args: exp.DateTrunc(
            unit=exp.Literal.string(str(seq_get(args, 1))),
            this=seq_get(args, 0),
            zone=seq_get(args, 2),
        ),
        "DATETIME": _build_datetime,
        "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
        "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
        "DIV": binary_from_function(exp.IntDiv),
        "EDIT_DISTANCE": _build_levenshtein,
        "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate),
        "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
        "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar),
        "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
        "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract),
        "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
        "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar),
        "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
        "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
        "MD5": exp.MD5Digest.from_arg_list,
        "TO_HEX": _build_to_hex,
        "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
            [seq_get(args, 1), seq_get(args, 0)]
        ),
        "PARSE_TIMESTAMP": _build_parse_timestamp,
        "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
        "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract),
        "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract),
        "REGEXP_EXTRACT_ALL": _build_regexp_extract(
            exp.RegexpExtractAll, default_group=exp.Literal.number(0)
        ),
        "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
        "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
        "SPLIT": lambda args: exp.Split(
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
            this=seq_get(args, 0),
            expression=seq_get(args, 1) or exp.Literal.string(","),
        ),
        "STRPOS": exp.StrPosition.from_arg_list,
        "TIME": _build_time,
        "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
        "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
        "TIMESTAMP": _build_timestamp,
        "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
        "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
        "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
        ),
        "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
        ),
        "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
        "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
        "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime),
        "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp),
    }

    FUNCTION_PARSERS = {
        **parser.Parser.FUNCTION_PARSERS,
        "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
        "MAKE_INTERVAL": lambda self: self._parse_make_interval(),
        "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(),
    }
    FUNCTION_PARSERS.pop("TRIM")

    NO_PAREN_FUNCTIONS = {
        **parser.Parser.NO_PAREN_FUNCTIONS,
        TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
    }

    NESTED_TYPE_TOKENS = {
        *parser.Parser.NESTED_TYPE_TOKENS,
        TokenType.TABLE,
    }

    PROPERTY_PARSERS = {
        **parser.Parser.PROPERTY_PARSERS,
        "NOT DETERMINISTIC": lambda self: self.expression(
            exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
        ),
        "OPTIONS": lambda self: self._parse_with_property(),
    }

    CONSTRAINT_PARSERS = {
        **parser.Parser.CONSTRAINT_PARSERS,
        "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
    }

    RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
    RANGE_PARSERS.pop(TokenType.OVERLAPS)

    NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

    DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN}

    STATEMENT_PARSERS = {
        **parser.Parser.STATEMENT_PARSERS,
        TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
        TokenType.END: lambda self: self._parse_as_command(self._prev),
        TokenType.FOR: lambda self: self._parse_for_in(),
        TokenType.EXPORT: lambda self: self._parse_export_data(),
    }

    BRACKET_OFFSETS = {
        "OFFSET": (0, False),
        "ORDINAL": (1, False),
        "SAFE_OFFSET": (0, True),
        "SAFE_ORDINAL": (1, True),
    }

    def _parse_for_in(self) -> exp.ForIn:
        this = self._parse_range()
        self._match_text_seq("DO")
        return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

    def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
        this = super()._parse_table_part(schema=schema) or self._parse_number()

        # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
        if isinstance(this, exp.Identifier):
            table_name = this.name
            while self._match(TokenType.DASH, advance=False) and self._next:
                start = self._curr
                while self._is_connected() and not self._match_set(
                    self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False
                ):
                    self._advance()

                if start == self._curr:
                    break

                table_name += self._find_sql(start, self._prev)

            this = exp.Identifier(
                this=table_name, quoted=this.args.get("quoted")
            ).update_positions(this)
        elif isinstance(this, exp.Literal):
            table_name = this.name

            if self._is_connected() and self._parse_var(any_token=True):
                table_name += self._prev.text

            this = exp.Identifier(this=table_name, quoted=True).update_positions(this)

        return this

    def _parse_table_parts(
        self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
    ) -> exp.Table:
        table = super()._parse_table_parts(
            schema=schema, is_db_reference=is_db_reference, wildcard=True
        )

        # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
        if not table.catalog:
            if table.db:
                previous_db = table.args["db"]
                parts = table.db.split(".")
                if len(parts) == 2 and not table.args["db"].quoted:
                    table.set(
                        "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db)
                    )
                    table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db))
            else:
                previous_this = table.this
                parts = table.name.split(".")
                if len(parts) == 2 and not table.this.quoted:
                    table.set(
                        "db", exp.Identifier(this=parts[0]).update_positions(previous_this)
                    )
                    table.set(
                        "this", exp.Identifier(this=parts[1]).update_positions(previous_this)
                    )

        if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
            alias = table.this
            catalog, db, this, *rest = (
                exp.to_identifier(p, quoted=True)
                for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
            )

            for part in (catalog, db, this):
                if part:
                    part.update_positions(table.this)

            if rest and this:
                this = exp.Dot.build([this, *rest])  # type: ignore

            table = exp.Table(
                this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
            )
            table.meta["quoted_table"] = True
        else:
            alias = None

        # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
        # dataset, so if the project identifier is omitted we need to fix the ast so that
        # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
        # Otherwise, we wouldn't correctly qualify a `Table` node that references these
        # views, because it would seem like the "catalog" part is set, when it'd actually
        # be the region/dataset. Merging the two identifiers into a single one is done to
        # avoid producing a 4-part Table reference, which would cause issues in the schema
        # module, when there are 3-part table names mixed with information schema views.
        #
        # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
        table_parts = table.parts
        if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
            # We need to alias the table here to avoid breaking existing qualified columns.
            # This is expected to be safe, because if there's an actual alias coming up in
            # the token stream, it will overwrite this one. If there isn't one, we are only
            # exposing the name that can be used to reference the view explicitly (a no-op).
            exp.alias_(
                table,
                t.cast(exp.Identifier, alias or table_parts[-1]),
                table=True,
                copy=False,
            )

            info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
            new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions(
                line=table_parts[-2].meta.get("line"),
                col=table_parts[-1].meta.get("col"),
                start=table_parts[-2].meta.get("start"),
                end=table_parts[-1].meta.get("end"),
            )
            table.set("this", new_this)
            table.set("db", seq_get(table_parts, -3))
            table.set("catalog", seq_get(table_parts, -4))

        return table

    def _parse_column(self) -> t.Optional[exp.Expression]:
        column = super()._parse_column()
        if isinstance(column, exp.Column):
            parts = column.parts
            if any("." in p.name for p in parts):
                catalog, db, table, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                )

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                column.meta["quoted_column"] = True

        return column

    @t.overload
    def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

    @t.overload
    def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

    def _parse_json_object(self, agg=False):
        json_object = super()._parse_json_object()
        array_kv_pair = seq_get(json_object.expressions, 0)

        # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
        if (
            array_kv_pair
            and isinstance(array_kv_pair.this, exp.Array)
            and isinstance(array_kv_pair.expression, exp.Array)
        ):
            keys = array_kv_pair.this.expressions
            values = array_kv_pair.expression.expressions

            json_object.set(
                "expressions",
                [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
            )

        return json_object

    def _parse_bracket(
        self, this: t.Optional[exp.Expression] = None
    ) -> t.Optional[exp.Expression]:
        bracket = super()._parse_bracket(this)

        if this is bracket:
            return bracket

        if isinstance(bracket, exp.Bracket):
            for expression in bracket.expressions:
                name = expression.name.upper()

                if name not in self.BRACKET_OFFSETS:
                    break

                offset, safe = self.BRACKET_OFFSETS[name]
                bracket.set("offset", offset)
                bracket.set("safe", safe)
                expression.replace(expression.expressions[0])

        return bracket

    def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
        unnest = super()._parse_unnest(with_alias=with_alias)

        if not unnest:
            return None

        unnest_expr = seq_get(unnest.expressions, 0)
        if unnest_expr:
            from sqlglot.optimizer.annotate_types import annotate_types

            unnest_expr = annotate_types(unnest_expr, dialect=self.dialect)

            # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields,
            # in contrast to other dialects such as DuckDB which flattens only the array by default
            if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any(
                array_elem.is_type(exp.DataType.Type.STRUCT)
                for array_elem in unnest_expr._type.expressions
            ):
                unnest.set("explode_array", True)

        return unnest

    def _parse_make_interval(self) -> exp.MakeInterval:
        expr = exp.MakeInterval()

        for arg_key in expr.arg_types:
            value = self._parse_lambda()

            if not value:
                break

            # Non-named arguments are filled sequentially, (optionally) followed by named arguments
            # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2)
            if isinstance(value, exp.Kwarg):
                arg_key = value.this.name

            expr.set(arg_key, value)

            self._match(TokenType.COMMA)

        return expr

    def _parse_features_at_time(self) -> exp.FeaturesAtTime:
        expr = self.expression(
            exp.FeaturesAtTime,
            this=(self._match(TokenType.TABLE) and self._parse_table())
            or self._parse_select(nested=True),
        )

        while self._match(TokenType.COMMA):
            arg = self._parse_lambda()

            # Get the LHS of the Kwarg and set the arg to that value, e.g
            # "num_rows => 1" sets the expr's `num_rows` arg
            if arg:
                expr.set(arg.this.name, arg)

        return expr

    def _parse_export_data(self) -> exp.Export:
        self._match_text_seq("DATA")

        return self.expression(
            exp.Export,
            connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(),
            options=self._parse_properties(),
            this=self._match_text_seq("AS") and self._parse_select(),
        )
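Two of the parser extensions above are easy to exercise directly: dashed, unquoted table names (stitched back together by _parse_table_part) and MAKE_INTERVAL's mix of positional and named arguments (_parse_make_interval). A minimal sketch; the printed SQL is illustrative:

import sqlglot

# Dashes are legal in unquoted BigQuery table paths (e.g. project IDs).
print(sqlglot.parse_one("SELECT * FROM my-project.mydataset.mytable", read="bigquery").sql(dialect="bigquery"))

# Positional arguments fill MAKE_INTERVAL's slots in order; named ones can
# follow in any order.
print(sqlglot.parse_one("SELECT MAKE_INTERVAL(1, minute => 5, day => 2)", read="bigquery").sql(dialect="bigquery"))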
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- OPTIONAL_ALIAS_TOKEN_CTE
- ALTER_RENAME_REQUIRES_COLUMN
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- errors
- sql
class Generator(generator.Generator):
    INTERVAL_ALLOWS_PLURAL_FORM = False
    JOIN_HINTS = False
    QUERY_HINTS = False
    TABLE_HINTS = False
    LIMIT_FETCH = "LIMIT"
    RENAME_TABLE_WITH_DB = False
    NVL2_SUPPORTED = False
    UNNEST_WITH_ORDINALITY = False
    COLLATE_IS_FUNC = True
    LIMIT_ONLY_LITERALS = True
    SUPPORTS_TABLE_ALIAS_COLUMNS = False
    UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
    JSON_KEY_VALUE_PAIR_SEP = ","
    NULL_ORDERING_SUPPORTED = False
    IGNORE_NULLS_IN_FUNC = True
    JSON_PATH_SINGLE_QUOTE_ESCAPE = True
    CAN_IMPLEMENT_ARRAY_ANY = True
    SUPPORTS_TO_NUMBER = False
    NAMED_PLACEHOLDER_TOKEN = "@"
    HEX_FUNC = "TO_HEX"
    WITH_PROPERTIES_PREFIX = "OPTIONS"
    SUPPORTS_EXPLODING_PROJECTIONS = False
    EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False
    SUPPORTS_UNIX_SECONDS = True

    TRANSFORMS = {
        **generator.Generator.TRANSFORMS,
        exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
        exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
        exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
        exp.Array: inline_array_unless_query,
        exp.ArrayContains: _array_contains_sql,
        exp.ArrayFilter: filter_array_using_unnest,
        exp.ArrayRemove: filter_array_using_unnest,
        exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
        exp.CollateProperty: lambda self, e: (
            f"DEFAULT COLLATE {self.sql(e, 'this')}"
            if e.args.get("default")
            else f"COLLATE {self.sql(e, 'this')}"
        ),
        exp.Commit: lambda *_: "COMMIT TRANSACTION",
        exp.CountIf: rename_func("COUNTIF"),
        exp.Create: _create_sql,
        exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
        exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
        exp.DateDiff: lambda self, e: self.func(
            "DATE_DIFF", e.this, e.expression, unit_to_var(e)
        ),
        exp.DateFromParts: rename_func("DATE"),
        exp.DateStrToDate: datestrtodate_sql,
        exp.DateSub: date_add_interval_sql("DATE", "SUB"),
        exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
        exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
        exp.DateTrunc: lambda self, e: self.func(
            "DATE_TRUNC", e.this, e.text("unit"), e.args.get("zone")
        ),
        exp.FromTimeZone: lambda self, e: self.func(
            "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
        ),
        exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
        exp.GroupConcat: lambda self, e: groupconcat_sql(
            self, e, func_name="STRING_AGG", within_group=False
        ),
        exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
        exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"),
        exp.If: if_sql(false_value="NULL"),
        exp.ILike: no_ilike_sql,
        exp.IntDiv: rename_func("DIV"),
        exp.Int64: rename_func("INT64"),
        exp.JSONExtract: _json_extract_sql,
        exp.JSONExtractArray: _json_extract_sql,
        exp.JSONExtractScalar: _json_extract_sql,
        exp.JSONFormat: rename_func("TO_JSON_STRING"),
        exp.Levenshtein: _levenshtein_sql,
        exp.Max: max_or_greatest,
        exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
        exp.MD5Digest: rename_func("MD5"),
        exp.Min: min_or_least,
        exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
        exp.RegexpExtract: lambda self, e: self.func(
            "REGEXP_EXTRACT",
            e.this,
            e.expression,
            e.args.get("position"),
            e.args.get("occurrence"),
        ),
        exp.RegexpExtractAll: lambda self, e: self.func(
            "REGEXP_EXTRACT_ALL", e.this, e.expression
        ),
        exp.RegexpReplace: regexp_replace_sql,
        exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
        exp.ReturnsProperty: _returnsproperty_sql,
        exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
        exp.Select: transforms.preprocess(
            [
                transforms.explode_projection_to_unnest(),
                transforms.unqualify_unnest,
                transforms.eliminate_distinct_on,
                _alias_ordered_group,
                transforms.eliminate_semi_and_anti_joins,
            ]
        ),
        exp.SHA: rename_func("SHA1"),
        exp.SHA2: sha256_sql,
        exp.StabilityProperty: lambda self, e: (
            "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
        ),
        exp.String: rename_func("STRING"),
        exp.StrPosition: lambda self, e: (
            strposition_sql(
                self, e, func_name="INSTR", supports_position=True, supports_occurrence=True
            )
        ),
        exp.StrToDate: _str_to_datetime_sql,
        exp.StrToTime: _str_to_datetime_sql,
        exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
        exp.TimeFromParts: rename_func("TIME"),
        exp.TimestampFromParts: rename_func("DATETIME"),
        exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
        exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
        exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
        exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
        exp.TimeStrToTime: timestrtotime_sql,
        exp.Transaction: lambda *_: "BEGIN TRANSACTION",
        exp.TsOrDsAdd: _ts_or_ds_add_sql,
        exp.TsOrDsDiff: _ts_or_ds_diff_sql,
        exp.TsOrDsToTime: rename_func("TIME"),
        exp.TsOrDsToDatetime: rename_func("DATETIME"),
        exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"),
        exp.Unhex: rename_func("FROM_HEX"),
        exp.UnixDate: rename_func("UNIX_DATE"),
        exp.UnixToTime: _unix_to_time_sql,
        exp.Uuid: lambda *_: "GENERATE_UUID()",
        exp.Values: _derived_table_values_to_unnest,
        exp.VariancePop: rename_func("VAR_POP"),
        exp.SafeDivide: rename_func("SAFE_DIVIDE"),
    }

    SUPPORTED_JSON_PATH_PARTS = {
        exp.JSONPathKey,
        exp.JSONPathRoot,
        exp.JSONPathSubscript,
    }

    TYPE_MAPPING = {
        **generator.Generator.TYPE_MAPPING,
        exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
        exp.DataType.Type.BIGINT: "INT64",
        exp.DataType.Type.BINARY: "BYTES",
        exp.DataType.Type.BLOB: "BYTES",
        exp.DataType.Type.BOOLEAN: "BOOL",
        exp.DataType.Type.CHAR: "STRING",
        exp.DataType.Type.DECIMAL: "NUMERIC",
        exp.DataType.Type.DOUBLE: "FLOAT64",
        exp.DataType.Type.FLOAT: "FLOAT64",
        exp.DataType.Type.INT: "INT64",
        exp.DataType.Type.NCHAR: "STRING",
        exp.DataType.Type.NVARCHAR: "STRING",
        exp.DataType.Type.SMALLINT: "INT64",
        exp.DataType.Type.TEXT: "STRING",
        exp.DataType.Type.TIMESTAMP: "DATETIME",
        exp.DataType.Type.TIMESTAMPNTZ: "DATETIME",
        exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
        exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
        exp.DataType.Type.TINYINT: "INT64",
        exp.DataType.Type.ROWVERSION: "BYTES",
        exp.DataType.Type.UUID: "STRING",
        exp.DataType.Type.VARBINARY: "BYTES",
        exp.DataType.Type.VARCHAR: "STRING",
        exp.DataType.Type.VARIANT: "ANY TYPE",
    }

    PROPERTIES_LOCATION = {
        **generator.Generator.PROPERTIES_LOCATION,
        exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
        exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
    }

    # WINDOW comes after QUALIFY
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
    AFTER_HAVING_MODIFIER_TRANSFORMS = {
        "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
        "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
    }

    # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
    RESERVED_KEYWORDS = {
        "all", "and", "any", "array", "as", "asc", "assert_rows_modified", "at",
        "between", "by", "case", "cast", "collate", "contains", "create", "cross",
        "cube", "current", "default", "define", "desc", "distinct", "else", "end",
        "enum", "escape", "except", "exclude", "exists", "extract", "false", "fetch",
        "following", "for", "from", "full", "group", "grouping", "groups", "hash",
        "having", "if", "ignore", "in", "inner", "intersect", "interval", "into",
        "is", "join", "lateral", "left", "like", "limit", "lookup", "merge",
        "natural", "new", "no", "not", "null", "nulls", "of", "on", "or", "order",
        "outer", "over", "partition", "preceding", "proto", "qualify", "range",
        "recursive", "respect", "right", "rollup", "rows", "select", "set", "some",
        "struct", "tablesample", "then", "to", "treat", "true", "unbounded",
        "union", "unnest", "using", "when", "where", "window", "with", "within",
    }

    def mod_sql(self, expression: exp.Mod) -> str:
        this = expression.this
        expr = expression.expression
        return self.func(
            "MOD",
            this.unnest() if isinstance(this, exp.Paren) else this,
            expr.unnest() if isinstance(expr, exp.Paren) else expr,
        )

    def column_parts(self, expression: exp.Column) -> str:
        if expression.meta.get("quoted_column"):
            # If a column reference is of the form `dataset.table`.name, we need
            # to preserve the quoted table path, otherwise the reference breaks
            table_parts = ".".join(p.name for p in expression.parts[:-1])
            table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
            return f"{table_path}.{self.sql(expression, 'this')}"

        return super().column_parts(expression)

    def table_parts(self, expression: exp.Table) -> str:
        # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
        # we need to make sure the correct quoting is used in each case.
        #
        # For example, if there is a CTE x that clashes with a schema name, then the former will
        # return the table y in that schema, whereas the latter will return the CTE's y column:
        #
        # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
        # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
        if expression.meta.get("quoted_table"):
            table_parts = ".".join(p.name for p in expression.parts)
            return self.sql(exp.Identifier(this=table_parts, quoted=True))

        return super().table_parts(expression)

    def timetostr_sql(self, expression: exp.TimeToStr) -> str:
        this = expression.this
        if isinstance(this, exp.TsOrDsToDatetime):
            func_name = "FORMAT_DATETIME"
        elif isinstance(this, exp.TsOrDsToTimestamp):
            func_name = "FORMAT_TIMESTAMP"
        else:
            func_name = "FORMAT_DATE"

        time_expr = (
            this
            if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
            else expression
        )
        return self.func(
            func_name, self.format_time(expression), time_expr.this, expression.args.get("zone")
        )

    def eq_sql(self, expression: exp.EQ) -> str:
        # Operands of = cannot be NULL in BigQuery
        if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
            if not isinstance(expression.parent, exp.Update):
                return "NULL"

        return self.binary(expression, "=")

    def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
        parent = expression.parent

        # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
        # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
        if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
            return self.func(
                "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
            )

        return super().attimezone_sql(expression)

    def trycast_sql(self, expression: exp.TryCast) -> str:
        return self.cast_sql(expression, safe_prefix="SAFE_")

    def bracket_sql(self, expression: exp.Bracket) -> str:
        this = expression.this
        expressions = expression.expressions

        if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
            arg = expressions[0]
            if arg.type is None:
                from sqlglot.optimizer.annotate_types import annotate_types

                arg = annotate_types(arg, dialect=self.dialect)

            if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                # BQ doesn't support bracket syntax with string values for structs
                return f"{self.sql(this)}.{arg.name}"

        expressions_sql = self.expressions(expression, flat=True)
        offset = expression.args.get("offset")

        if offset == 0:
            expressions_sql = f"OFFSET({expressions_sql})"
        elif offset == 1:
            expressions_sql = f"ORDINAL({expressions_sql})"
        elif offset is not None:
            self.unsupported(f"Unsupported array offset: {offset}")

        if expression.args.get("safe"):
            expressions_sql = f"SAFE_{expressions_sql}"

        return f"{self.sql(this)}[{expressions_sql}]"

    def in_unnest_op(self, expression: exp.Unnest) -> str:
        return self.sql(expression)

    def version_sql(self, expression: exp.Version) -> str:
        if expression.name == "TIMESTAMP":
            expression.set("this", "SYSTEM_TIME")
        return super().version_sql(expression)

    def contains_sql(self, expression: exp.Contains) -> str:
        this = expression.this
        expr = expression.expression

        if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower):
            this = this.this
            expr = expr.this

        return self.func("CONTAINS_SUBSTR", this, expr)

    def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
        this = expression.this

        # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3]
        # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions,
        # because they aren't literals and so the above syntax is invalid BigQuery.
        if isinstance(this, exp.Array):
            elem = seq_get(this.expressions, 0)
            if not (elem and elem.find(exp.Query)):
                return f"{self.sql(expression, 'to')}{self.sql(this)}"

        return super().cast_sql(expression, safe_prefix=safe_prefix)
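Two of the generator behaviors above, exercised end to end: bracket_sql's OFFSET/ORDINAL handling and eq_sql's NULL folding. A minimal sketch; outputs are printed rather than asserted:

import sqlglot

# SAFE_OFFSET(0) roundtrips through the parser's BRACKET_OFFSETS and the
# generator's bracket_sql.
print(sqlglot.transpile("SELECT arr[SAFE_OFFSET(0)] FROM t", read="bigquery", write="bigquery")[0])

# `= NULL` is never true in BigQuery, so eq_sql folds it to NULL outside
# of UPDATE statements.
print(sqlglot.transpile("SELECT a = NULL FROM t", read="duckdb", write="bigquery")[0])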
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default: ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3.
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False.
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80.
- comments: Whether to preserve comments in the output SQL code. Default: True.
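These options are normally supplied through sqlglot's top-level API rather than by instantiating the generator directly. A minimal sketch, assuming only the public transpile entry point (the illustrative aliases x and y are not from the source):

    import sqlglot

    sql = "select 1 as x, 2 as y"

    # Generator options such as `pretty` and `identify` are forwarded from
    # transpile() to the BigQuery generator; keywords are uppercased by the
    # default normalize_functions behavior.
    print(sqlglot.transpile(sql, write="bigquery", pretty=True, identify=True)[0])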
def column_parts(self, expression: exp.Column) -> str:
    if expression.meta.get("quoted_column"):
        # If a column reference is of the form `dataset.table`.name, we need
        # to preserve the quoted table path, otherwise the reference breaks
        table_parts = ".".join(p.name for p in expression.parts[:-1])
        table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
        return f"{table_path}.{self.sql(expression, 'this')}"

    return super().column_parts(expression)
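A hedged roundtrip sketch of the quoted-column behavior (dataset.table and col are illustrative names; the output depends on the parser setting the quoted_column meta flag):

    import sqlglot

    # The quoted table path `dataset.table` is kept as a single identifier
    # rather than being split into `dataset`.`table`.
    sql = "SELECT `dataset.table`.col FROM `dataset.table`"
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])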
def table_parts(self, expression: exp.Table) -> str:
    # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
    # we need to make sure the correct quoting is used in each case.
    #
    # For example, if there is a CTE x that clashes with a schema name, then the former will
    # return the table y in that schema, whereas the latter will return the CTE's y column:
    #
    # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
    # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
    if expression.meta.get("quoted_table"):
        table_parts = ".".join(p.name for p in expression.parts)
        return self.sql(exp.Identifier(this=table_parts, quoted=True))

    return super().table_parts(expression)
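A minimal sketch reusing the query from the comment above; the point is that the single quoted identifier survives the roundtrip (exact output may vary by sqlglot version):

    import sqlglot

    # `x.y` (one quoted identifier) must not be rewritten to `x`.`y`,
    # since the two resolve to different data sources in BigQuery.
    sql = "WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`"
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])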
def timetostr_sql(self, expression: exp.TimeToStr) -> str:
    this = expression.this
    if isinstance(this, exp.TsOrDsToDatetime):
        func_name = "FORMAT_DATETIME"
    elif isinstance(this, exp.TsOrDsToTimestamp):
        func_name = "FORMAT_TIMESTAMP"
    else:
        func_name = "FORMAT_DATE"

    time_expr = (
        this
        if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
        else expression
    )
    return self.func(
        func_name, self.format_time(expression), time_expr.this, expression.args.get("zone")
    )
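A hedged sketch of the dispatch: a TimeToStr over a DATE-valued argument should come back out as FORMAT_DATE, while DATETIME and TIMESTAMP arguments pick the corresponding FORMAT_* function (assuming the BigQuery parser maps FORMAT_DATE to exp.TimeToStr):

    import sqlglot

    # Roundtrip of a DATE-formatting call; DATETIME/TIMESTAMP inputs would
    # select FORMAT_DATETIME / FORMAT_TIMESTAMP instead.
    sql = "SELECT FORMAT_DATE('%Y-%m-%d', DATE '2023-01-01')"
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])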
def eq_sql(self, expression: exp.EQ) -> str:
    # Operands of = cannot be NULL in BigQuery
    if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
        if not isinstance(expression.parent, exp.Update):
            return "NULL"

    return self.binary(expression, "=")
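A minimal sketch of the NULL-operand rewrite (x is an illustrative column; exact output may vary by sqlglot version):

    import sqlglot

    # `x = NULL` can never be TRUE in BigQuery, so the comparison itself is
    # generated as NULL -- except inside an UPDATE's SET clause, where `=`
    # is an assignment rather than a comparison.
    print(sqlglot.transpile("SELECT x = NULL", read="duckdb", write="bigquery")[0])
    # Expected output (approximately): SELECT NULL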
def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
    parent = expression.parent

    # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
    # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
    if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
        return self.func(
            "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
        )

    return super().attimezone_sql(expression)
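A hedged sketch of the rewrite, assuming a Postgres-style AT TIME ZONE input and an illustrative column ts:

    import sqlglot

    # Outside of the CAST ... FORMAT form, AT TIME ZONE has no direct BigQuery
    # syntax, so it is rewritten as nested TIMESTAMP(DATETIME(...)) calls.
    sql = "SELECT ts AT TIME ZONE 'America/New_York'"
    print(sqlglot.transpile(sql, read="postgres", write="bigquery")[0])
    # Expected output (approximately): SELECT TIMESTAMP(DATETIME(ts, 'America/New_York'))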
def bracket_sql(self, expression: exp.Bracket) -> str:
    this = expression.this
    expressions = expression.expressions

    if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
        arg = expressions[0]
        if arg.type is None:
            from sqlglot.optimizer.annotate_types import annotate_types

            arg = annotate_types(arg, dialect=self.dialect)

        if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
            # BQ doesn't support bracket syntax with string values for structs
            return f"{self.sql(this)}.{arg.name}"

    expressions_sql = self.expressions(expression, flat=True)
    offset = expression.args.get("offset")

    if offset == 0:
        expressions_sql = f"OFFSET({expressions_sql})"
    elif offset == 1:
        expressions_sql = f"ORDINAL({expressions_sql})"
    elif offset is not None:
        self.unsupported(f"Unsupported array offset: {offset}")

    if expression.args.get("safe"):
        expressions_sql = f"SAFE_{expressions_sql}"

    return f"{self.sql(this)}[{expressions_sql}]"
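A minimal roundtrip sketch of the subscript handling (arr and t are illustrative names; output may vary by sqlglot version):

    import sqlglot

    # Zero-based subscripts come back wrapped in OFFSET, one-based in ORDINAL,
    # and the SAFE_ prefix is preserved via the bracket's `safe` arg.
    sql = "SELECT arr[SAFE_OFFSET(0)], arr[ORDINAL(1)] FROM t"
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])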
def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
    this = expression.this

    # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3]
    # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions,
    # because they aren't literals and so the above syntax is invalid BigQuery.
    if isinstance(this, exp.Array):
        elem = seq_get(this.expressions, 0)
        if not (elem and elem.find(exp.Query)):
            return f"{self.sql(expression, 'to')}{self.sql(this)}"

    return super().cast_sql(expression, safe_prefix=safe_prefix)
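A minimal sketch of the roundtrip the comment above describes:

    import sqlglot

    # Inline type-annotated array literals keep their ARRAY<...> prefix
    # instead of being wrapped in a CAST.
    sql = "SELECT ARRAY<INT64>[1, 2, 3]"
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])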
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SUPPORTS_WINDOW_EXCLUDE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- ilike_sql
- ilikeany_sql
- is_sql
- like_sql
- likeany_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- featuresattime_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- get_put_sql
- translatecharacters_sql