5454import google .api_core .gapic_v1 .client_info
5555import google .auth .credentials
5656import google .cloud .bigquery as bigquery
57+ import google .cloud .bigquery .table
5758import google .cloud .bigquery_connection_v1
5859import google .cloud .bigquery_storage_v1
5960import google .cloud .functions_v2
@@ -693,7 +694,7 @@ def read_gbq_table(
693694
694695 def _get_snapshot_sql_and_primary_key (
695696 self ,
696- table_ref : bigquery .table .TableReference ,
697+ table : google . cloud . bigquery .table .Table ,
697698 * ,
698699 api_name : str ,
699700 use_cache : bool = True ,
@@ -709,7 +710,7 @@ def _get_snapshot_sql_and_primary_key(
709710 table ,
710711 ) = bigframes_io .get_snapshot_datetime_and_table_metadata (
711712 self .bqclient ,
712- table_ref = table_ref ,
713+ table_ref = table . reference ,
713714 api_name = api_name ,
714715 cache = self ._df_snapshot ,
715716 use_cache = use_cache ,
@@ -735,7 +736,7 @@ def _get_snapshot_sql_and_primary_key(
735736
736737 try :
737738 table_expression = self .ibis_client .sql (
738- bigframes_io .create_snapshot_sql (table_ref , snapshot_timestamp )
739+ bigframes_io .create_snapshot_sql (table . reference , snapshot_timestamp )
739740 )
740741 except google .api_core .exceptions .Forbidden as ex :
741742 if "Drive credentials" in ex .message :
@@ -763,8 +764,9 @@ def _read_gbq_table(
763764 query , default_project = self .bqclient .project
764765 )
765766
767+ table = self .bqclient .get_table (table_ref )
766768 (table_expression , primary_keys ,) = self ._get_snapshot_sql_and_primary_key (
767- table_ref , api_name = api_name , use_cache = use_cache
769+ table , api_name = api_name , use_cache = use_cache
768770 )
769771 total_ordering_cols = primary_keys
770772
@@ -836,9 +838,13 @@ def _read_gbq_table(
836838 ordering = ordering ,
837839 )
838840 else :
839- array_value = self ._create_total_ordering (table_expression )
841+ array_value = self ._create_total_ordering (
842+ table_expression , table_rows = table .num_rows
843+ )
840844 else :
841- array_value = self ._create_total_ordering (table_expression )
845+ array_value = self ._create_total_ordering (
846+ table_expression , table_rows = table .num_rows
847+ )
842848
843849 value_columns = [col for col in array_value .column_ids if col not in index_cols ]
844850 block = blocks .Block (
@@ -1459,10 +1465,19 @@ def _create_empty_temp_table(
14591465 def _create_total_ordering (
14601466 self ,
14611467 table : ibis_types .Table ,
1468+ table_rows : Optional [int ],
14621469 ) -> core .ArrayValue :
14631470 # Since this might also be used as the index, don't use the default
14641471 # "ordering ID" name.
1472+
1473+ # For small tables, a 64-bit hash is enough to make collisions unlikely; with 128 bits a collision is practically impossible.
1474+ # Assume the table is large if its row count is unknown (None) or reported as 0.
1475+ use_double_hash = (
1476+ (table_rows is None ) or (table_rows == 0 ) or (table_rows > 100000 )
1477+ )
1478+
14651479 ordering_hash_part = guid .generate_guid ("bigframes_ordering_" )
1480+ ordering_hash_part2 = guid .generate_guid ("bigframes_ordering_" )
14661481 ordering_rand_part = guid .generate_guid ("bigframes_ordering_" )
14671482
14681483 # All inputs into hash must be non-null or resulting hash will be null
@@ -1475,25 +1490,30 @@ def _create_total_ordering(
14751490 else str_values [0 ]
14761491 )
14771492 full_row_hash = full_row_str .hash ().name (ordering_hash_part )
1493+ # By modifying value slightly, we get another hash uncorrelated with the first
1494+ full_row_hash_p2 = (full_row_str + "_" ).hash ().name (ordering_hash_part2 )
14781495 # Used to disambiguate between identical rows (which will have identical hash)
14791496 random_value = ibis .random ().name (ordering_rand_part )
14801497
1498+ order_values = (
1499+ [full_row_hash , full_row_hash_p2 , random_value ]
1500+ if use_double_hash
1501+ else [full_row_hash , random_value ]
1502+ )
1503+
14811504 original_column_ids = table .columns
14821505 table_with_ordering = table .select (
1483- itertools .chain (original_column_ids , [ full_row_hash , random_value ] )
1506+ itertools .chain (original_column_ids , order_values )
14841507 )
14851508
1486- ordering_ref1 = order .ascending_over (ordering_hash_part )
1487- ordering_ref2 = order .ascending_over (ordering_rand_part )
14881509 ordering = order .ExpressionOrdering (
1489- ordering_value_columns = (ordering_ref1 , ordering_ref2 ),
1490- total_ordering_columns = frozenset ([ordering_hash_part , ordering_rand_part ]),
1510+ ordering_value_columns = tuple (
1511+ order .ascending_over (col .get_name ()) for col in order_values
1512+ ),
1513+ total_ordering_columns = frozenset (col .get_name () for col in order_values ),
14911514 )
14921515 columns = [table_with_ordering [col ] for col in original_column_ids ]
1493- hidden_columns = [
1494- table_with_ordering [ordering_hash_part ],
1495- table_with_ordering [ordering_rand_part ],
1496- ]
1516+ hidden_columns = [table_with_ordering [col .get_name ()] for col in order_values ]
14971517 return core .ArrayValue .from_ibis (
14981518 self ,
14991519 table_with_ordering ,
0 commit comments