介绍一个快速检查近日redo产生量以及等待判断是否需要增加redo size的脚本

通常我们定位一个数据库的redolog是否需要增加，都是通过观察online redolog的切换频率是否频繁，观察告警是否存在checkpoint incompleted，观察awr中的log file等待时间是否正常来判断是否需要增加日志大小或者日志组数，今天找到一个非常不错的脚本，分析过去几天内平均1小时产生的redo量，lgwr的等待方面的统计，也可以作为评估redo产生量的一个统计。

执行结果类似如下图：

最后列为建议增加的值，还有ckpt的等待，总体的cpu开销，物理读和写消耗，日志等待事件，进程数等统计。

具体脚本内容如下：

set pagesize 50000 linesize 300

col instance_number             format 99                   head 'In|st'
col tim                                                     head 'Period end'
col cpu_sec                     format 999,999,999.9        head 'CPU used|sec'
col phy_reads                   format 999,999,999          head 'Physical|reads'
col phy_writes                  format 999,999,999          head 'Physical|writes'
col cr_served                   format 999,999,999          head 'CR blocks|served'
col current_served              format 999,999,999          head 'CUR blocks|served'
col redo_mb                     format 999,999,999.9        head 'Redo, MB'
col processes                   format 999,999              head 'Proc|esses'
col avg_df_seq                  format 9,999.9              head 'Avg 1|read'
col avg_df_scat                 format 9,999.9              head 'Avg N|read'
col redo_diff_to_md_pct         format 999,999              head 'Redo Diff|to median, %'
col avg_lfpw                    format 999.99               head 'Avg|LFPW'
col avg_log_sync                format 9,999.99             head 'Avg Log|Sync, ms'
col log_ckpt_sec                format 999,999              head 'CKPT|waits, s'
col redo_needed                 format 999,999              head 'Redo to|Add, MB'

compute max of cpu_sec          on instance_number
compute max of phy_reads        on instance_number
compute max of phy_writes       on instance_number
compute max of cr_served        on instance_number
compute max of current_served   on instance_number
compute max of phy_writes       on instance_number
compute max of redo_needed      on instance_number
compute max of log_ckpt_sec     on instance_number
compute max of avg_log_sync     on instance_number
compute max of avg_lfpw         on instance_number
compute max of redo_mb          on instance_number
compute max of processes        on instance_number
compute max of avg_df_seq       on instance_number
compute max of avg_df_scat      on instance_number

break on instance_number skip page

with t_interval as
(
 select /*+ inline */ sysdate-30 begin, sysdate as end from dual
)
select
  stats.dbid                                                                dbid,
  stats.instance_number                                                     instance_number,
  to_char(stats.begin_interval_time, 'YYYYMMDD HH24MI')                     tim,
  stats.cpu_used / 100                                                      cpu_sec,
  stats.phy_reads                                                           phy_reads,
  stats.phy_writes                                                          phy_writes,
  stats.cr_served                                                           cr_served,
  stats.current_served                                                      current_served,
  stats.redo_size / 1024 / 1024                                             redo_mb,
  procs.current_utilization                                                 processes,
--
  waits.df_seq_micro / 1000 / nullif(waits.df_seq_waits,0)                  avg_df_seq,
  waits.df_scat_micro / 1000 / nullif(waits.df_scat_waits,0)                avg_df_scat,
  (stats.redo_size - stats.md_redo_size) * 100 / stats.md_redo_size         redo_diff_to_md_pct,
  stats.redo_write_time*10/stats.redo_writes                                avg_lfpw,
  waits.log_sync_micro/nullif(waits.log_sync_waits, 0) / 1000               avg_log_sync,
  waits.log_ckpt_micro/1e6                                                  log_ckpt_sec,
  ( stats.redo_size /
     ( waits.snap_interval * 86400 ) ) *
   ( waits.log_ckpt_micro/1e6 ) / 1024 / 1024                               redo_needed,
  stats.is_restart
from
  (
   select
     snap_id,
     begin_interval_time,
     snap_interval,
     instance_number,
     dbid,
     log_sync_micro,
     log_sync_waits,
     log_ckpt_micro,
     log_ckpt_waits,
     df_seq_micro,
     df_seq_waits,
     df_scat_micro,
     df_scat_waits,
     direct_micro,
     direct_waits,
     median(log_sync_micro/nullif(log_sync_waits, 0)) over (partition by dbid, instance_number) md_log_sync_micro
   from
   (
      select
        snap_id,
        begin_interval_time,
        instance_number,
        dbid,
        max(snap_interval) snap_interval,
        max(decode(event_name, 'log file sync',                            wait_micro))    log_sync_micro,
        max(decode(event_name, 'log file sync',                            total_waits))   log_sync_waits,
        max(decode(event_name, 'log file switch (checkpoint incomplete)',  wait_micro))    log_ckpt_micro,
        max(decode(event_name, 'log file switch (checkpoint incomplete)',  total_waits))   log_ckpt_waits,
        max(decode(event_name, 'db file sequential read',                  wait_micro))    df_seq_micro,
        max(decode(event_name, 'db file sequential read',                  total_waits))   df_seq_waits,
        max(decode(event_name, 'db file scattered read',                   wait_micro))    df_scat_micro,
        max(decode(event_name, 'db file scattered read',                   total_waits))   df_scat_waits,
        max(decode(event_name, 'direct path read',                         wait_micro))    direct_micro,
        max(decode(event_name, 'direct path read',                         total_waits))   direct_waits
      from
      (
        select
          e.snap_id,
          e.instance_number,
          e.dbid,
          sn.begin_interval_time,
          cast(begin_interval_time as date) - cast(lag(begin_interval_time) over (partition by e.dbid, e.instance_number, e.event_name order by sn.begin_interval_time) as date) snap_interval,
          sn.startup_time,
          e.event_name,
          case when (sn.begin_interval_time >= sn.startup_time and lag(sn.begin_interval_time) over (partition by e.dbid, e.instance_number, e.event_name order by sn.begin_interval_time) < sn.startup_time)
            then e.time_waited_micro
            else e.time_waited_micro - lag(e.time_waited_micro) over (partition by e.dbid, e.instance_number, e.event_name order by sn.begin_interval_time)
          end wait_micro,
          case when (sn.begin_interval_time >= sn.startup_time and lag(sn.begin_interval_time) over (partition by e.dbid, e.instance_number, e.event_name order by sn.begin_interval_time) < sn.startup_time)
            then e.total_waits
            else e.total_waits - lag(e.total_waits) over (partition by e.dbid, e.instance_number, e.event_name order by sn.begin_interval_time)
          end total_waits
        from
          dba_hist_system_event e,
          dba_hist_snapshot     sn,
          t_interval            t
        where
          sn.snap_id = e.snap_id and
          sn.dbid = e.dbid and
          sn.instance_number = e.instance_number and
          sn.begin_interval_time between t.begin and t.end and
          e.event_name in (
            'log file sync',
            'log file switch (checkpoint incomplete)',
            'db file sequential read',
            'db file scattered read',
            'direct path read'
          )
      )
      group by dbid, instance_number, begin_interval_time, snap_id
    )
  ) waits,
  (
    select
      snap_id,
      begin_interval_time,
      instance_number,
      dbid,
      redo_size,
      redo_write_time,
      redo_writes,
      is_restart,
      cpu_used,
      phy_reads,
      phy_reads_cache,
      phy_writes,
      phy_writes_cache,
      cr_served,
      current_served,
      median(redo_size) over (partition by dbid, instance_number) md_redo_size
    from
    (
      select
        snap_id,
        begin_interval_time,
        instance_number,
        dbid,
        max(is_restart) is_restart,
        max(decode(stat_name, 'redo size',                       stat_diff)) redo_size,
        max(decode(stat_name, 'redo write time',                 stat_diff)) redo_write_time,
        max(decode(stat_name, 'redo writes',                     stat_diff)) redo_writes,
        max(decode(stat_name, 'CPU used by this session',        stat_diff)) cpu_used,
        max(decode(stat_name, 'physical read total IO requests', stat_diff)) phy_reads,
        max(decode(stat_name, 'physical reads cache',            stat_diff)) phy_reads_cache,
        max(decode(stat_name, 'physical write total IO requests',stat_diff)) phy_writes,
        max(decode(stat_name, 'physical writes from cache',      stat_diff)) phy_writes_cache,
        max(decode(stat_name, 'gc cr blocks served',             stat_diff)) cr_served,
        max(decode(stat_name, 'gc current blocks served',        stat_diff)) current_served
      from
      (
        select
          stats.snap_id,
          stats.instance_number,
          stats.dbid,
          sn.begin_interval_time,
          sn.startup_time,
          stats.stat_name,
          case when (sn.begin_interval_time >= sn.startup_time and lag(sn.begin_interval_time) over (partition by stats.dbid, stats.instance_number, stats.stat_id order by sn.begin_interval_time) < sn.startup_time)
            then stats.value
            else stats.value - lag(stats.value) over (partition by stats.dbid, stats.instance_number, stats.stat_id order by stats.snap_id)
          end stat_diff,
          case when (sn.begin_interval_time >= sn.startup_time and lag(sn.begin_interval_time) over (partition by stats.dbid, stats.instance_number, stats.stat_id order by sn.begin_interval_time) < sn.startup_time)
            then 'Yes'
          end is_restart
        from
          dba_hist_sysstat   stats,
          dba_hist_snapshot  sn,
          t_interval         t
        where
          sn.snap_id = stats.snap_id and
          sn.dbid = stats.dbid and
          sn.instance_number = stats.instance_number and
          sn.begin_interval_time between t.begin and t.end and
          stats.stat_name in (
            'redo size',
            'redo write time',
            'redo writes',
            'CPU used by this session',
            'physical read total IO requests',
            'physical reads cache',
            'physical write total IO requests',
            'physical writes from cache',
            'gc cr blocks served',
            'gc current blocks served'
          )
      )
      group by dbid, instance_number, begin_interval_time, snap_id
    )
  ) stats,
  (
    select
      stats.snap_id,
      stats.instance_number,
      stats.dbid,
      stats.resource_name,
      stats.current_utilization
    from
      dba_hist_resource_limit stats,
      dba_hist_snapshot       sn,
      t_interval              t
    where
      sn.snap_id = stats.snap_id and
      sn.dbid = stats.dbid and
      sn.instance_number = stats.instance_number and
      sn.begin_interval_time between t.begin and t.end and
      stats.resource_name = 'processes'
  ) procs
where
  waits.dbid = stats.dbid and
  waits.instance_number = stats.instance_number and
  waits.snap_id = stats.snap_id and
  waits.dbid = procs.dbid and
  waits.instance_number = procs.instance_number and
  waits.snap_id = procs.snap_id
order by
 stats.dbid, stats.instance_number, stats.begin_interval_time
;

该脚本转自pythian.

Oracle Redo 数据技术 | Guang Cai Li | 2014-07-29