1.CurrentActivity.sql

set lines 150
set pages 999
clear col

set termout off
set trimout on
set trimspool on

REM
REM Current transactions
REM
REM Will show only last transaction by a user
REM
REM May need to use 786472.1 for better picture
REM of activity

connect / as sysdba
alter session set nls_date_format=’dd-Mon-yyyy hh24:mi’;

col username format a10 wrapped heading “User”
col name format a22 wrapped heading “Undo Segment Name”
col xidusn heading “Undo|Seg #”
col xidslot heading “Undo|Slot #”
col xidsqn heading “Undo|Seq #”
col ubafil heading “File #”
col ubablk heading “Block #”
col start_time format a10 word_wrapped heading “Started”
col status format a8 heading “Status”
col blk format 999,999,999 heading “KBytes”
col used_urec heading “Rows”

spool undoactivity.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

prompt
prompt ############## Current Uncommitted Transactions ##############
prompt

select start_time, username, r.name,
ubafil, ubablk, t.status, (used_ublk*p.value)/1024 blk, used_urec
from v$transaction t, v$rollname r, v$session s, v$parameter p
where xidusn=usn
and s.saddr=t.ses_addr
and p.name=’db_block_size’
order by 1;
spool off

set termout on
set trimout off
set trimspool off
clear col

2.LobData.sql

REM List table and column information for LOBs for a specific user
REM
REM UNDO handling with LOBs is not designed for frequent updates
REM Frequent updates are best handled with PCTVERSION at 100
REM This means you must have a lot of space available in the LOB
REM tablespace as all UNDO will be maintained over time.
REM
REM Using RETENTION does not work as expected
REM It is set to UNDO_RETENTION at the time of the creation of the
REM object. It does not change over time as UNDO_RETENTION
REM or auto-tuned undo retention changes.

set pages 999
set lines 110

spool lobdata.out

col column_name format a25 head “Column”
col table_name format a25 head “Table”
col tablespace_name format a25 head “Tablespace”
col pctversion format 999 head “PCTVersion %”
col segment_space_management format a30 head “Space|Mngmnt”
col retention format 999,999,999 head “Retention”

select l.table_name, l.column_name, l.tablespace_name, l.pctversion, l.retention,
t.segment_space_management
from dba_lobs l, dba_tablespaces t
where owner=upper(‘&USER’)
and l.tablespace_name = t.tablespace_name
/

spool off

3. undoconfig.sql

spool UndoConfig.out

ttitle off
set pages 999
set lines 150
set verify off

set termout off
set trimout on
set trimspool on

REM
REM ————————————————————————

REM
REM —————————————————————–
REM

set space 2

REM REPORTING TABLESPACE INFORMATION:
REM
REM This looks at Tablespace Sizing – Total bytes and free bytes
REM

column tablespace_name format a30 heading ‘TS Name’
column sbytes format 9,999,999,999 heading ‘Total MBytes’
column fbytes format 9,999,999,999 heading ‘Free MBytes’
column file_name format a30 heading ‘File Name’
column kount format 999 heading ‘Ext’

compute sum of fbytes on tablespace_name
compute sum of sbytes on tablespace_name
compute sum of sbytes on report
compute sum of fbytes on report

break on tablespace_name skip 2

select a.tablespace_name, a.file_name, round(a.bytes/1024/1024,0) sbytes,
round(sum(b.bytes/1024/1024),0) fbytes, count(*) kount, autoextensible
from dba_data_files a, dba_free_space b
where a.file_id = b.file_id
and a.tablespace_name in (select z.tablespace_name from dba_tablespaces z where retention like ‘%GUARANTEE’)
group by a.tablespace_name, a.file_name, a.bytes, autoextensible
order by a.tablespace_name
/

set linesize 160

REM
REM If you can significantly reduce physical reads by adding incremental
REM data buffers…do it. To determine whether adding data buffers will
REM help, set db_block_lru_statistics = TRUE and
REM db_block_lru_extended_statistics = TRUE in the init.ora parameters.
REM You can determine how many extra hits you would get from memory as
REM opposed to physical I/O from disk. **NOTE: Turning these on will
REM impact performance. One shift of statistics gathering should be enough
REM to get the required information.
REM

REM
REM —————————————————————–
REM

set lines 160

col tablespace_name format a30 heading “Tablespace”
col tb format a15 heading “TB Status”
col df format a10 heading “DF Status”
col extent_management format a15 heading “Extent|Management”
col allocation_type format a8 heading “Type”
col segment_space_management format a7 heading “Auto|Segment”
col retention format a11 heading “Retention|Level”
col autoextensible format a5 heading “Auto?”
col mx format 999,999,999 heading “Max Allowed”

select t.tablespace_name, t.status tb, d.status df,
extent_management, allocation_type, segment_space_management, retention,
autoextensible, (maxbytes/1024/1024) mx
from dba_tablespaces t, dba_data_files d
where t.tablespace_name = d.tablespace_name
and retention like ‘%GUARANTEE’
/

col status format a20 head “Status”
col cnt format 999,999,999 head “How Many?”

select status, count(*) cnt
from dba_rollback_segs
group by status
/

spool off

set termout on
set trimout off
set trimspool off

4.undodatafiles.sql

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-Mon-yyyy hh24:mi’;

spool undodatafiles.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

prompt
prompt ############## DATAFILES ##############
prompt

col retention head “Retention”
col tablespace_name format a30 head “TBSP Name”
col file_id format 999 head “File #”
col a format 999,999,999,999,999 head “Bytes Alloc (MB)”
col b format 999,999,999,999,999 head “Max Bytes Used (MB)”
col autoextensible head “Auto|Ext”
col extent_management head “Ext Mngmnt”
col allocation_type head “Type”
col segment_space_management head “SSM”

select tablespace_name, file_id, sum(bytes)/1024/1024 a,
sum(maxbytes)/1024/1024 b,
autoextensible
from dba_data_files
where tablespace_name in (select tablespace_name from dba_tablespaces
where retention like ‘%GUARANTEE’ )
group by file_id, tablespace_name, autoextensible
order by tablespace_name
/

spool off
set termout on
set trimout off
set trimspool off
clear col

5.undoextends.sql

spool UndoExts.out

ttitle off
set pages 999
set lines 150
set verify off

set termout off
set trimout on
set trimspool on

REM
REM ————————————————————————

REM
REM —————————————————————–
REM

REM
REM REPORTING UNDO EXTENTS INFORMATION:
REM
REM —————————————————————–
REM
REM Undo Extents breakdown information
REM

ttitle center “Rollback Segments Breakdown” skip 2

col status format a20
col cnt format 999,999,999 head “How Many?”

select status, count(*) cnt from dba_rollback_segs
group by status
/

ttitle center “Undo Extents” skip 2

col segment_name format a30 heading “Name”
col “ACT BYTES” format 999,999,999,999 head “Active|Extents”
col “UNEXP BYTES” format 999,999,999,999 head “Unxpired|Extents”
col “EXP BYTES” format 999,999,999,999 head “Expired|Extents”

select segment_name,
nvl(sum(act),0) “ACT BYTES”,
nvl(sum(unexp),0) “UNEXP BYTES”,
nvl(sum(exp),0) “EXP BYTES”
from (
select segment_name,
nvl(sum(bytes),0) act,00 unexp, 00 exp
from DBA_UNDO_EXTENTS
where status=’ACTIVE’ group by segment_name
union
select segment_name,
00 act, nvl(sum(bytes),0) unexp, 00 exp
from DBA_UNDO_EXTENTS
where status=’UNEXPIRED’ group by segment_name
union
select segment_name,
00 act, 00 unexp, nvl(sum(bytes),0) exp
from DBA_UNDO_EXTENTS
where status=’EXPIRED’ group by segment_name
) group by segment_name;

ttitle center “Undo Extents Statistics” skip 2

col size format 999,999,999,999 heading “Size”
col “HOW MANY” format 999,999,999 heading “How Many?”
col st heading a12 heading “Status”

select distinct status st, count(*) “HOW MANY”, sum(bytes) “SIZE”
from dba_undo_extents
group by status
/

col segment_name format a30 heading “Name”
col TABLESPACE_NAME for a20
col BYTES for 999,999,999,999
col BLOCKS for 999,999,999
col status for a15 heading “Status”
col segment_name heading “Segment”
col extent_id heading “ID”

select SEGMENT_NAME, TABLESPACE_NAME, EXTENT_ID,
FILE_ID, BLOCK_ID, BYTES, BLOCKS, STATUS
from dba_undo_extents
order by 1,3,4,5
/

REM
REM —————————————————————–
REM
REM Undo Extents Contention breakdown
REM Take out column TUNED_UNDORETENTION if customer
REM prior to 10.2.x
REM
REM The time frame can be adjusted with this query
REM By default using around 4 hour window of time
REM
REM Ex.
REM Using sysdate-.04 looking at the last hour
REM Using sysdate-.16 looking at the last 4 hours
REM Using sysdate-.32 looking at the last 8 hours
REM Using sysdate-1 looking at the last 24 hours
REM

set linesize 140

ttitle center “Undo Extents Error Conditions (Default – Last 4 Hours)” skip 2

col UNXPSTEALCNT format 999,999,999 heading “# Unexpired|Stolen”
col EXPSTEALCNT format 999,999,999 heading “# Expired|Reused”
col SSOLDERRCNT format 999,999,999 heading “ORA-1555|Error”
col NOSPACEERRCNT format 999,999,999 heading “Out-Of-space|Error”
col MAXQUERYLEN format 999,999,999 heading “Max Query|Length”
col TUNED_UNDORETENTION format 999,999,999 heading “Auto-Ajusted|Undo Retention”
col hours format 999,999 heading “Tuned|(HRs)”

select inst_id, to_char(begin_time,’MM/DD/YYYY HH24:MI’) begin_time,
UNXPSTEALCNT, EXPSTEALCNT , SSOLDERRCNT, NOSPACEERRCNT, MAXQUERYLEN,
TUNED_UNDORETENTION, TUNED_UNDORETENTION/60/60 hours
from gv$undostat
where begin_time between (sysdate-.16)
and sysdate
order by inst_id, begin_time
/

spool off
set termout on
set trimout off
set trimspool off

6.undoheathcheck.sql

spool Undohealth.out

ttitle off
set pages 999
set lines 150
set verify off
set termout off
set trimout on
set trimspool on

REM
REM ————————————————————————

col name format a30
col gets format 9,999,999
col waits format 9,999,999

PROMPT ROLLBACK HIT STATISTICS:
REM

REM GETS – # of gets on the rollback segment header
REM WAITS – # of waits for the rollback segment header

set head on;

select name, waits, gets
from v$rollstat, v$rollname
where v$rollstat.usn = v$rollname.usn
/

col pct head “< 2% ideal” select ‘The average of waits/gets is ‘|| round((sum(waits) / sum(gets)) * 100,2)||’%’ PCT From v$rollstat / PROMPT REDO CONTENTION STATISTICS: REM REM If the ratio of waits to gets is more than 1% or 2%, consider REM creating more rollback segments REM REM Another way to gauge rollback contention is: REM column xn1 format 9999999 column xv1 new_value xxv1 noprint select class, count from v$waitstat where class in (‘system undo header’, ‘system undo block’, ‘undo header’, ‘undo block’ ) / set head off select ‘Total requests = ‘||sum(count) xn1, sum(count) xv1 from v$waitstat / select ‘Contention for system undo header = ‘|| (round(count/(&xxv1+0.00000000001),4)) * 100||’%’ from v$waitstat where class = ‘system undo header’ / select ‘Contention for system undo block = ‘|| (round(count/(&xxv1+0.00000000001),4)) * 100||’%’ from v$waitstat where class = ‘system undo block’ / select ‘Contention for undo header = ‘|| (round(count/(&xxv1+0.00000000001),4)) * 100||’%’ from v$waitstat where class = ‘undo header’ / select ‘Contention for undo block = ‘|| (round(count/(&xxv1+0.00000000001),4)) * 100||’%’ from v$waitstat where class = ‘undo block’ / REM REM NOTE: Not as useful with AUM configured REM REM If the percentage for an area is more than 1% or 2%, consider REM creating more rollback segments. Note: This value is usually very REM small REM and has been rounded to 4 places. REM REM ———————————————————————— REM REM The following shows how often user processes had to wait for space in REM the redo log buffer: select name||’ = ‘||value from v$sysstat where name = ‘redo log space requests’ / REM REM This value should be near 0. If this value increments consistently, REM processes have had to wait for space in the redo buffer. If this REM condition exists over time, increase the size of LOG_BUFFER in the REM init.ora file in increments of 5% until the value nears 0. REM ** NOTE: increasing the LOG_BUFFER value will increase total SGA size. REM REM ———————————————————————– col name format a15 col gets format 9999999 col misses format 9999999 col immediate_gets heading ‘IMMED GETS’ format 9999999 col immediate_misses heading ‘IMMED MISS’ format 9999999 col sleeps format 999999 PROMPT LATCH CONTENTION: REM REM GETS – # of successful willing-to-wait requests for a latch REM MISSES – # of times an initial willing-to-wait request was unsuccessful REM IMMEDIATE_GETS – # of successful immediate requests for each latch REM IMMEDIATE_MISSES = # of unsuccessful immediate requests for each latch REM SLEEPS – # of times a process waited and requests a latch after an REM initial willing-to-wait request REM REM If the latch requested with a willing-to-wait request is not REM available, the requesting process waits a short time and requests REM again. REM If the latch requested with an immediate request is not available, REM the requesting process does not wait, but continues processing REM set head on select name, gets, misses, immediate_gets, immediate_misses, sleeps from v$latch where name in (‘redo allocation’, ‘redo copy’) / set head off select ‘Ratio of MISSES to GETS: ‘|| round((sum(misses)/(sum(gets)+0.00000000001) * 100),2)||’%’ from v$latch where name in (‘redo allocation’, ‘redo copy’) / select ‘Ratio of IMMEDIATE_MISSES to IMMEDIATE_GETS: ‘|| round((sum(immediate_misses)/ (sum(immediate_misses+immediate_gets)+0.00000000001) * 100),2)||’%’ from v$latch where name in (‘redo allocation’, ‘redo copy’) / set head on REM REM If either ratio exceeds 1%, performance will be affected. REM REM Decreasing the size of LOG_SMALL_ENTRY_MAX_SIZE reduces the number of REM processes copying information on the redo allocation latch. REM REM Increasing the size of LOG_SIMULTANEOUS_COPIES will reduce contention REM for redo copy latches. REM REM —————————————————————– REM This looks at overall i/o activity against individual REM files within a tablespace REM REM Look for a mismatch across disk drives in terms of I/O REM REM Also, examine the Blocks per Read Ratio for heavily accessed REM TSs – if this value is significantly above 1 then you may have REM full tablescans occurring (with multi-block I/O) REM REM If activity on the files is unbalanced, move files around to balance REM the load. Should see an approximately even set of numbers across files REM set space 1 PROMPT REPORTING I/O STATISTICS: column pbr format 99999999 heading ‘Physical|Blk Read’ column pbw format 999999 heading ‘Physical|Blks Wrtn’ column pyr format 999999 heading ‘Physical|Reads’ column readtim format 99999999 heading ‘Read|Time’ column name format a55 heading ‘DataFile Name’ column writetim format 99999999 heading ‘Write|Time’ compute sum of f.phyblkrd, f.phyblkwrt on report select fs.name name, f.phyblkrd pbr, f.phyblkwrt pbw, f.readtim, f.writetim from v$filestat f, v$datafile fs where f.file# = fs.file# order by fs.name / REM REM —————————————————————– PROMPT GENERATING WAIT STATISTICS: REM REM This will show wait stats for certain kernel instances. This REM may show the need for additional rbs, wait lists, db_buffers REM column class heading ‘Class Type’ column count heading ‘Times Waited’ format 99,999,999 column time heading ‘Total Times’ format 99,999,999 select class, count, time from v$waitstat where count > 0
order by class
/

REM
REM Look at the wait statistics generated above (if any). They will
REM tell you where there is contention in the system. There will
REM usually be some contention in any system – but if the ratio of
REM waits for a particular operation starts to rise, you may need to
REM add additional resource, such as more database buffers, log buffers,
REM or rollback segments
REM
REM —————————————————————–

PROMPT ROLLBACK EXTENT STATISTICS:
REM

column usn format 999 heading ‘Undo #’
column extents format 999 heading ‘Extents’
column rssize format 999,999,999 heading ‘Size in|Bytes’
column optsize format 999,999,999 heading ‘Optimal|Size’
column hwmsize format 99,999,999 heading ‘High Water|Mark’
column shrinks format 9,999 heading ‘Num of|Shrinks’
column wraps format 9,999 heading ‘Num of|Wraps’
column extends format 999,999 heading ‘Num of|Extends’
column aveactive format 999,999,999 heading ‘Average size|Active Extents’
column rownum noprint

select usn, extents, rssize, optsize, hwmsize,
shrinks, wraps, extends, aveactive
from v$rollstat
order by rownum
/

spool off
set termout on
set trimout off
set trimspool off

7.undohistoryinfo.sql

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-hh24:mi’;

spool undohistoryinfo.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

prompt
prompt ############## HISTORICAL DATA ##############
prompt

col x format 999,999 head “Max Concurrent|Last 7 Days”
col y format 999,999 head “Max Concurrent|Since Startup”

select max(maxconcurrency) x from v$undostat
/
select max(maxconcurrency) y from sys.wrh$_undostat
/

col i format 999,999 head “1555 Errors”
col j format 999,999 head “Undo Space Errors”

select sum(ssolderrcnt) i from v$undostat
where end_time > sysdate-2
/

select sum(nospaceerrcnt) j from v$undostat
where end_time > sysdate-2
/
clear break
clear compute

prompt
prompt ############## CURRENT STATUS OF SEGMENTS ##############
prompt ############## SNAPSHOT IN TIME INFO ##############
prompt ##############(SHOWS CURRENT UNDO ACTIVITY)##############
prompt

col segment_name format a30 head “Segment Name”
col “ACT BYTES” format 999,999,999,999 head “Active Bytes”
col “UNEXP BYTES” format 999,999,999,999 head “Unexpired Bytes”
col “EXP BYTES” format 999,999,999,999 head “Expired Bytes”

select segment_name, nvl(sum(act),0) “ACT BYTES”,
nvl(sum(unexp),0) “UNEXP BYTES”,
nvl(sum(exp),0) “EXP BYTES”
from (select segment_name, nvl(sum(bytes),0) act,00 unexp, 00 exp
from dba_undo_extents where status=’ACTIVE’ group by segment_name
union
select segment_name, 00 act, nvl(sum(bytes),0) unexp, 00 exp
from dba_undo_extents where status=’UNEXPIRED’ group by segment_name
union
select segment_name, 00 act, 00 unexp, nvl(sum(bytes),0) exp
from dba_undo_extents where status=’EXPIRED’ group by segment_name)
group by segment_name
order by 1
/

prompt
prompt ############## UNDO SPACE USAGE ##############
prompt

col usn format 999,999 head “Segment#”
col shrinks format 999,999,999 head “Shrinks”
col aveshrink format 999,999,999 head “Avg Shrink Size”

select usn, shrinks, aveshrink from v$rollstat
/
spool off
set termout on
set trimout off
set trimspool off
clear col

8.undoparameters.sql

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-Mon-yyyy hh24:mi’;

spool undoparameters.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

col inst_id format 999 head “Instance #”
col Parameter format a35 wrap
col “Session Value” format a25 wrapped
col “Instance Value” format a25 wrapped

prompt
prompt ############## PARAMETERS ##############
prompt

select a.inst_id, a.ksppinm “Parameter”,
b.ksppstvl “Session Value”,
c.ksppstvl “Instance Value”
from x$ksppi a, x$ksppcv b, x$ksppsv c
where a.indx = b.indx and a.indx = c.indx
and a.inst_id=b.inst_id and b.inst_id=c.inst_id
and a.ksppinm in (‘_undo_autotune’, ‘_smu_debug_mode’,
‘_highthreshold_undoretention’,
‘undo_tablespace’,’undo_retention’,’undo_management’)
order by 2;

spool off
set termout on
set trimout off
set trimspool off
clear col

9. undopressure.sql

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-hh24:mi’;

spool undopressure.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

prompt
prompt ############## WAITS FOR UNDO (Since Startup) ##############
prompt

col inst_id head “Instance#”
col eq_type format a3 head “Enq”
col total_req# format 999,999,999,999,999,999 head “Total Requests”
col total_wait# format 999,999 head “Total Waits”
col succ_req# format 999,999,999,999,999,999 head “Successes”
col failed_req# format 999,999,999999 head “Failures”
col cum_wait_time format 999,999,999 head “Cummalitve|Time”

select * from v$enqueue_stat where eq_type=’US’
union
select * from v$enqueue_stat where eq_type=’HW’
/

prompt
prompt ############## LOCKS FOR UNDO ##############
prompt

col addr head “ADDR”
col KADDR head “KADDR”
col sid head “Session”
col osuser format a10 head “OS User”
col machine format a15 head “Machine”
col program format a17 head “Program”
col process format a7 head “Process”
col lmode head “Lmode”
col request head “Request”
col ctime format 9,999 head “Time|(Mins)”
col block head “Blocking?”

select /*+ RULE */ a.SID, b.process,
b.OSUSER, b.MACHINE, b.PROGRAM,
addr, kaddr, lmode, request, round(ctime/60/60,0) ctime, block
from
v$lock a,
v$session b
where
a.sid=b.sid
and a.type=’US’
/

prompt
prompt ############## TUNED RETENTION HISTORY (Last 2 Days) ##############
prompt ############## LOWEST AND HIGHEST DATA ##############
prompt

col low format 999,999,999,999 head “Undo Retention|Lowest Tuned Value”
col high format 999,999,999,999 head “Undo Retention|Highest Tuned Value”

select end_time, tuned_undoretention from v$undostat where tuned_undoretention = (
select min(tuned_undoretention) low
from v$undostat
where end_time > sysdate-2)
/

select end_time, tuned_undoretention from v$undostat where tuned_undoretention = (
select max(tuned_undoretention) high
from v$undostat
where end_time > sysdate-2)
/

prompt
prompt ############## CURRENT TRANSACTIONS ##############
prompt

col sql_text format a40 word_wrapped head “SQL Code”

select a.start_date, a.start_scn, a.status, c.sql_text
from v$transaction a, v$session b, v$sqlarea c
where b.saddr=a.ses_addr and c.address=b.sql_address
and b.sql_hash_value=c.hash_value
/

select current_scn from v$database
/

col a format 999,999 head “UnexStolen”
col b format 999,999 head “ExStolen”
col c format 999,999 head “UnexReuse”
col d format 999,999 head “ExReuse”

prompt
prompt ############## WHO’S STEALING WHAT? (Last 2 Days) ##############
prompt

select unxpstealcnt a, expstealcnt b,
unxpblkreucnt c, expblkreucnt d
from v$undostat
where (unxpstealcnt > 0 or expstealcnt > 0)
and end_time > sysdate-2
/

spool off
set termout on
set trimout off
set trimspool off
clear col

10. undostatistics.sql

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-hh24:mi’;

spool undostatistics.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

col current_scn head “SCN Now”
col start_date head “Trans Started”
col start_scn head “SCN for Trans”
col ses_addr head “ADDR”

prompt
prompt ############## Historical V$UNDOSTAT (Last 2 Days) ##############
prompt

col end_time format a18 Head “Date/Time”
col maxq format 999,999 head “Query|Maximum|Minutes”
col maxquerysqlid head “SqlID”
col undotsn format 999,999 head “TBS”
col undoblks format 999,999,999 head “Undo|Blocks”
col txncount format 999,999,999 head “# of|Trans”
col unexpiredblks format 999,999,999 head “# of Unexpired”
col expiredblks format 999,999,999 head “# of Expired”
col tuned format 999,999 head “Tuned Retention|(Minutes)”

select end_time, round(maxquerylen/60,0) maxq, maxquerysqlid,
undotsn, undoblks, txncount, unexpiredblks, expiredblks,
round(tuned_undoretention/60,0) Tuned
from dba_hist_undostat
where end_time > sysdate-2
order by 1
/

prompt
prompt ############## RECENT MISSES FOR UNDO (Last 2 Days) ##############
prompt

clear col
set lines 500
select * from v$undostat where maxquerylen > tuned_undoretention
and end_time > sysdate-2
order by 2
/

select * from sys.wrh$_undostat where maxquerylen > tuned_undoretention
and end_time > sysdate-2
order by 2
/

prompt
prompt ############## AUTO-TUNING TUNE-DOWN DATA ##############
prompt ############## ROLLBACK DATA (Since Startup) ##############
prompt

col name format a60 head “Name”
col value format 999,999,999 head “Counters”

select name, value from v$sysstat
where name like ‘%down retention%’ or name like ‘une down%’
or name like ‘%undo segment%’ or name like ‘%rollback%’
or name like ‘%undo record%’
/

prompt
prompt ############## Long Running Query History ##############
prompt

col end_time head “Date”
col maxquerysqlid head “SQL ID”
col runawayquerysqlid format a15 head “Runaway SQL ID”
col results format a35 word_wrapped head “Space Issues”
col status head “Status”
col newret head “Tuned Down|Retention”

select end_time, maxquerysqlid, runawayquerysqlid, status,
decode(status,1,’Slot Active’,4,’Reached Best Retention’,5,’Reached Best Retention’,
8, ‘Runaway Query’,9,’Runaway Query-Active’,10,’Space Pressure’,
11,’Space Pressure Currently’,
16, ‘Tuned Down (to undo_retention) due to Space Pressure’,
17,’Tuned Down (to undo_retention) due to Space Pressure-Active’,
18, ‘Tuning Down due to Runaway’, 19, ‘Tuning Down due to Runaway-Active’,
28, ‘Runaway tuned down to last tune down value’,
29, ‘Runaway tuned down to last tune down value’,
32, ‘Max Tuned Down – Not Auto-Tuning’,
33, ‘Max Tuned Down – Not Auto-Tuning (Active)’,
37, ‘Max Tuned Down – Not Auto-Tuning (Active)’,
38, ‘Max Tuned Down – Not Auto-Tuning’,
39, ‘Max Tuned Down – Not Auto-Tuning (Active)’,
40, ‘Max Tuned Down – Not Auto-Tuning’,
41, ‘Max Tuned Down – Not Auto-Tuning (Active)’,
42, ‘Max Tuned Down – Not Auto-Tuning’,
44, ‘Max Tuned Down – Not Auto-Tuning’,
45, ‘Max Tuned Down – Not Auto-Tuning (Active)’,
‘Other (‘||status||’)’) Results, spcprs_retention NewRet
from sys.wrh$_undostat
where status > 1
/

prompt
prompt ############## Details on Long Run Queries ##############
prompt

col sql_fulltext head “SQL Text”
Col sql_id heading “SQL ID”

select sql_id, sql_fulltext, last_load_time “Last Load”,
round(elapsed_time/1000000/60/60/24,0) “Elapsed Days”
from v$sql where sql_id in
(select maxquerysqlid from sys.wrh$_undostat
where status > 1)
/

spool off
set termout on
set trimout off
set trimspool off
clear col

11.

set lines 120
set pages 999
clear col

set termout off
set trimout on
set trimspool on

connect / as sysdba
alter session set nls_date_format=’dd-Mon-yyyy hh24:mi’;

spool undousage.out

prompt
prompt ############## RUNTIME ##############
prompt

col rdate head “Run Time”

select sysdate rdate from dual;

prompt
prompt ############## IN USE Undo Data ##############
prompt

select
((select (nvl(sum(bytes),0))
from dba_undo_extents
where tablespace_name in (select tablespace_name from dba_tablespaces
where retention like ‘%GUARANTEE’ )
and status in (‘ACTIVE’,’UNEXPIRED’)) *100) /
(select sum(bytes)
from dba_data_files
where tablespace_name in (select tablespace_name from dba_tablespaces
where retention like ‘%GUARANTEE’ )) “PCT_INUSE”
from dual;

select tablespace_name, extent_management, allocation_type,
segment_space_management, retention
from dba_tablespaces where retention like ‘%GUARANTEE’
/

col c format 999,999,999,999 head “Sum of Free”

select (nvl(sum(bytes),0)) c from dba_free_space
where tablespace_name in
(select tablespace_name from dba_tablespaces where retention like ‘%GUARANTEE’)
/

col d format 999,999,999,999 head “Total Bytes”

select sum(bytes) d from dba_data_files
where tablespace_name in
(select tablespace_name from dba_tablespaces where retention like ‘%GUARANTEE’)
/

PROMPT
PROMPT ############## UNDO SEGMENTS ##############
PROMPT

col status head “Status”
col z format 999,999 head “Total Extents”
break on report
compute sum on report of z

select status, count(*) z from dba_undo_extents
group by status
/

col z format 999,999 head “Undo Segments”

select status, count(*) z from dba_rollback_segs
group by status
/

clear break
clear compute

prompt
prompt ############## CURRENT STATUS OF SEGMENTS ##############
prompt ############## SNAPSHOT IN TIME INFO ##############
prompt ##############(SHOWS CURRENT UNDO ACTIVITY)##############
prompt

col segment_name format a30 head “Segment Name”
col “ACT BYTES” format 999,999,999,999 head “Active Bytes”
col “UNEXP BYTES” format 999,999,999,999 head “Unexpired Bytes”
col “EXP BYTES” format 999,999,999,999 head “Expired Bytes”

select segment_name, nvl(sum(act),0) “ACT BYTES”,
nvl(sum(unexp),0) “UNEXP BYTES”,
nvl(sum(exp),0) “EXP BYTES”
from (select segment_name, nvl(sum(bytes),0) act,00 unexp, 00 exp
from dba_undo_extents where status=’ACTIVE’ group by segment_name
union
select segment_name, 00 act, nvl(sum(bytes),0) unexp, 00 exp
from dba_undo_extents where status=’UNEXPIRED’ group by segment_name
union
select segment_name, 00 act, 00 unexp, nvl(sum(bytes),0) exp
from dba_undo_extents where status=’EXPIRED’ group by segment_name)
group by segment_name
order by 1
/

prompt
prompt ############## UNDO SPACE USAGE ##############
prompt

col usn format 999,999 head “Segment#”
col shrinks format 999,999,999 head “Shrinks”
col aveshrink format 999,999,999 head “Avg Shrink Size”

select usn, shrinks, aveshrink from v$rollstat
/
spool off
set termout on
set trimout off
set trimspool off
clear col

自动undo管理模式下的undo错误分析常用脚本

归档日志间隙（Archive Gap）

归档日志间隙是在 Standby 端一系列丢失的重做日志，导致日志应用服务无法继续运行。这通常发生在 Standby 端无法从 Primary Database 接收重做日志或重做日志在 Standby Database 上不可用时。常见原因有:

网络连接断开或者日志传输服务停止
Standby Database 不可用
日志传输服务的配置错误
Standby 端的 IO 问题
归档日志在应用到 Standby 前被手工删除
Primary 和 Standby 之间的网络带宽不足

一旦在 Standby Database 上存在归档间隙，Log Apply Services 就会卡住，直到日志间隙（Gap）被解决，例如。丢失的 Redo 被重新获取并且在 Standby 端可用。然后，日志应用服务可以选中它并继续处理。

解决日志间隙的方式

有4种方案来解决 Standby Database 上的日志间隙。这些方案在下面讨论。

自动日志间隙解决方案

自动日志间隙解决方案是由日志传输服务自动进行的。它会把当前正在传输的日志和最近收到的日志进行对比，如果有不匹配的情况出现，Standby 端的 RFS 进程就会检测到并自动发送 ARCH-RFS 心跳 Ping 请求来要求发送丢失的日志。这种类型的日志间隙解决方法使用了主数据库上的 log_archive_dest_n 中定义的 Service。另外 ARCH-RFS 心跳 Ping 可以通过对当前的日子序列号进行查询来检测日志间隙。如果存在日志间隙则仍通过ARCH-RFS 心跳 Ping 请求来解决它。在问题得到解决后，会通知日志传输进程（ARCH 或者 LGWR）。对于自动日志间隙解决方案，不需要额外的设置或者监控。

FAL (Fetch Archive Log)日志间隙解决方案

当一个归档日志在 Standby 数据库上被收到或者归档，它就会被注册到 Standby 的控制文件中。（您可以在物理 Standby 数据库上查询 v$archived_log 或在逻辑 Standby 数据库上查询 dba_logstdby_log 来获取这些注册信息）。如果这个文件因为某些原因丢失或者损坏（比如，它被意外删除），FAL 就会被调用来解决日志间隙问题。因为这些缺失的日志文件通常由 Standby 数据库上的日志应用服务检测到。它独立于日志传输服务，并且没有和主库之间的直接链接。要使用 FAL，必须在 Standby 数据库设置一个或者两个（11.2.0 之前的版本）初始化参数：

FAL_SERVER：设置 Oracle Net Service Name（TNS-Alias 或者 Connect Descriptor）指向用来获取丢失的归档日志的数据库。它可以是一个 Data Guard 环境的主库，或者是另一个备库，ArchiveLog Repository- 或者 Far Sync Standby (> Oracle 12.1.0) Database。可以指定多个 Service Names（逗号分隔）。FAL 会顺序的尝试这些数据库来解决日志间隙问题。

FAL_CLIENT (< Oracle 11.2.0)：在 FAL_SERVER 数据库上设置 Oracle Net Service Name（TNS-Alias 或 Connect Descriptor）指向接收 REDO 的 Standby 数据库（比如，它是 FAL_SERVER 数据库需要发送 REDO 到的目标数据库）。确保这个 TNS-Alias 存在于 FAL_SERVER 数据库的 TNSNAMES.ORA 文件中。从 Oracle 11.2.0 开始，这个参数不再需要。但是需要确保 FAL_SERVER 数据库存在一个log_archive_dest_n 指向要解决日志间隙问题的 Standby 数据库。

当 Log ApplyServices 检测到日志间隙问题，它会发送一个 FAL 请求把 FAL_CLIENT 信息（Version > 11.1.0 则为 db_unique_name）给 FAL_SERVER。一个 FAL_SERVER 数据库上的 ARCH-Process 会尝试获取那个日志并发送回 FAL_CLIENT（或者 db_unique_name 对应的 Destination）。如果第一个 FAL_SERVER 无法解决日志间隙，会尝试列表中的下一个 FAL_SERVER。如果所有的 FAL_SERVERs 都无法解决，那么 FAL 请求会失败，并且一个对应的错误信息会写入对应 Standby 数据库的 ALERT.LOG。

为了成功解决日志间隙问题，需要的归档日志应当存在于 FAL_SERVER 数据库（存在于磁盘并且对应的信息同时存在于控制文件中）。

FAL 从 Oracle 9.2.0 开始可用于物理 Standby 数据库，从 Oracle 10.1.0 开始可用于逻辑 Standby 数据库。

手工解决日志间隙

如果日志间隙问题不能被上面提到方式解决，那么可以尝试手工解决。

可以通过查询物理 Standby 数据库的 $archive_gap 或者逻辑 Standby 数据库的 dba_logstdby_log 来确定当前的归档日志间隙，例如：

物理 Standby 数据库

SQL> select * from v$archive_gap;

逻辑 Standby 数据库

SQL> select thread#, sequence# from dba_logstdby_log l where next_change# not in

(select first_change# from dba_logstdby_log where l.thread#=thread#)

order by thread#, sequence#;

现在复制缺失的特定编号的 redo 日志到 Standby 数据库的对应位置。如缺失的日志尚未注册到 Standby 数据库，需要先注册它们才能让 Log Apply Services 处理这些日志文件。可以使用下面的命令注册：

物理 Standby：

SQL> alter database register logfile ‘<File-Specification>’;

逻辑 Standby：

SQL> alter database register logical logfile ‘<File-Specification>’;

在它们被注册后，Log Apply Services 就可以处理了。

使用增量备份前滚（仅适用于物理 Standby）

如果日志间隙无法被上面提到的方式解决，间隙太大需要太久时间才能解决或者丢失的日志无法被找到，您仍然可以通过使用 SCN 增量备份来前滚物理 Standby 数据库。这个功能从 Oracle 10.2.0 开始可以使用。这个功能通过记录最后应用到 Standby 数据库的SCN，然后使用 RMAN 以及当前控制文件的备份来对主库创建一个从那个 SCN 开始的增量备份。

之后首先用增量备份中的控制文件替换 Standby 的控制文件，之后应用增量备份到 Standby 数据库。这是一个把 Standby 数据库同步到最新的主库的状态的最快最简单的方式。因为采取的步骤各个版本都不同

Oracle dataguard 日志缺失的解决办法

概述

某客户由存储厂商维护人员将LUN存储划分到EMR数据库主机上；在此期间由于不当操作，导致数据库集群状态异常，之后重启了数据库主机节点2，重启主机后集群软件GRID无法正常启动，节点1仍可以正常提供数据库服务。通过远程方式接入数据库环境进行故障排查，通过对相关日志等信息的深入分析，最终找到问题点并进行处理，数据库集群恢复正常。

集群及数据库日志分析

1. 15:36分主机重启前的错误信息

从此处日志可以看出，在15:36分时数据库实例报IO读写错误，同时数据库集群日志中也在显示报错，之后发生了主机重启动作。

2. 节点2重启主机后相关日志信息

节点2重启主机后，检查集群启动状态可以发现在启动到ASM实例时停止，检查ASM ALERT日志有如下信息：

集群ALERT日志：

2021-12-12 15:48:34.081:

[client(10295)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxemrdb02/client/crsctl_oracle.log.

[client(10405)]CRS-10001:12-Dec-19 15:48 ACFS-9391: Checking for existing ADVM/ACFS installation.

[client(10410)]CRS-10001:12-Dec-19 15:48 ACFS-9392: Validating ADVM/ACFS installation files for operating system.

[client(10412)]CRS-10001:12-Dec-19 15:48 ACFS-9393: Verifying ASM Administrator setup.

[client(10415)]CRS-10001:12-Dec-19 15:48 ACFS-9308: Loading installed ADVM/ACFS drivers.

[client(10418)]CRS-10001:12-Dec-19 15:48 ACFS-9154: Loading ‘oracleoks.ko’ driver.

[client(10446)]CRS-10001:12-Dec-19 15:48 ACFS-9154: Loading ‘oracleadvm.ko’ driver.

[client(10473)]CRS-10001:12-Dec-19 15:48 ACFS-9154: Loading ‘oracleacfs.ko’ driver.

[client(10560)]CRS-10001:12-Dec-19 15:48 ACFS-9327: Verifying ADVM/ACFS devices.

[client(10564)]CRS-10001:12-Dec-19 15:48 ACFS-9156: Detecting control device ‘/dev/asm/.asm_ctl_spec’.

[client(10568)]CRS-10001:12-Dec-19 15:48 ACFS-9156: Detecting control device ‘/dev/ofsctl’.

[client(10573)]CRS-10001:12-Dec-19 15:48 ACFS-9322: completed

2021-12-12 15:48:43.930:

[client(10374)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxemrdb02/client/crsctl_oracle.log.

2021-12-12 15:48:53.698:

[client(10598)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxemrdb02/client/crsctl_oracle.log.

2021-12-12 15:49:03.129:

[client(10905)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxemrdb02/client/crsctl_oracle.log.

2021-12-12 15:50:00.511:

[client(12458)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxemrdb02/client/crsctl_oracle.log.

2021-12-12 15:51:07.276:

节点2：ASM ALERT日志

Thu Dec 12 16:22:11 2021

MMNL started with pid=21, OS id=26248

lmon registered with NM – instance number 2 (internal mem no 1)

Thu Dec 12 16:24:08 2021

LMON received an instance eviction notification from instance 1

The instance eviction reason is 0x20000000

The instance eviction map is 2

Thu Dec 12 16:24:11 2021

PMON (ospid: 26206): terminating the instance due to error 481

Thu Dec 12 16:24:11 2021

ORA-1092 : opitsk aborting process

Thu Dec 12 16:24:13 2021

System state dump requested by (instance=2, osid=26206 (PMON)), summary=[abnormal instance termination].

System State dumped to trace file /u01/app/grid/diag/asm/+asm/+ASM2/trace/+ASM2_diag_26216_20211212162413.trc

Dumping diagnostic data in directory=[cdmp_20211212162411], requested by (instance=2, osid=26206 (PMON)), summary=[abnormal instance termination].

Instance terminated by PMON, pid = 26206

检查对应的LMON及DIAG日志：

[grid@cxemrdb02 trace]$ more ./+ASM2_diag_26216.trc

Trace file /u01/app/grid/diag/asm/+asm/+ASM2/trace/+ASM2_diag_26216.trc

Oracle Database 11g Enterprise Edition Release 11.2.0.4.0 – 64bit Production

With the Real Application Clusters and Automatic Storage Management options

ORACLE_HOME = /u01/app/11.2.0/grid

System name: ???Linux

Node name: ?????cxemrdb02

Release: ???????2.6.32-431.el6.x86_64

Version: ???????#1 SMP Sun Nov 10 22:19:54 EST 2013

Machine: ???????x86_64

Instance name: +ASM2

Redo thread mounted by this instance: 0 <none>

Oracle process number: 6

Unix process pid: 26216, image: oracle@cxemrdb02 (DIAG)

*** 2021-12-12 16:22:10.788

*** SESSION ID:(373.1) 2021-12-12 16:22:10.788

*** CLIENT ID:() 2021-12-12 16:22:10.788

*** SERVICE NAME:() 2021-12-12 16:22:10.788

*** MODULE NAME:() 2021-12-12 16:22:10.788

*** ACTION NAME:() 2021-12-12 16:22:10.788

Node id: 1

List of nodes: 0, 1,

*** 2021-12-12 16:22:10.790

Reconfiguration starts [incarn=0]

*** 2021-12-12 16:22:10.790

I’m the voting node

Group reconfiguration cleanup

kjzdattdlm: Can not attach to DLM (LMON up=[TRUE], DB mounted=[FALSE]).

3. 问题分析描述

通过对数据库集群相关日志的分析，可以发现在15:36分时的非正常操作，导致了数据库及集群报错并状态异常，此时进行了数据库主机的重启动作；重启后集群日志中显示ASM无法启动，根据ASM日志及TRACE文件信息，结合ORACLE官方文档中ASM on Non-First Node (Second or Others) Fails to Start: PMON (ospid: nnnn): terminating the instance due to error 481 (Doc ID 1383737.1)文档中描述的信息，可以判断是集群私网通信出现了问题，通过对集群私网相关配置进行分析、排查，最终确认是集群通信的socket文件(/var/tmp/.oracle)内容由于集群异常关闭导致不一致，从而引起集群节点2重启后不能正常启动。通过清理socket文件后集群可以正常启动恢复正常。

三、总结与后续处理建议

1. 问题总结

通过对数据库集群相关日志的分析，可以发现在15:36分时的非正常操作，导致了数据库及集群报错并状态异常，此时进行了数据库主机的重启动作；重启后集群日志中显示ASM无法启动，根据ASM日志及TRACE文件信息，结合ORACLE官方文档中ASM on Non-First Node (Second or Others) Fails to Start: PMON (ospid: nnnn): terminating the instance due to error 481 (Doc ID 1383737.1)文档中描述的信息，可以判断是集群私网通信出现了问题；通过对集群私网相关配置进行分析、排查，最终确认是集群通信的socket文件(/var/tmp/.oracle)内容由于集群异常关闭导致不一致，从而引起集群节点2重启后与节点1通信异常，不能正常启动。通过清理socket文件后集群可以正常启动，数据库集群恢复正常。

4. 后续对于集群添加磁盘的建议

对于生产环境数据库集群，向ASM环境中添加磁盘是一个重要且有一点风险的动作，由于数据库所有数据都存放在ASM磁盘组中，因此如果向已有ASM磁盘组添加磁盘不当，可能导致ASM磁盘组读写异常或不能正常MOUNT，从而影响正常的数据库业务运行。因此结合ORACLE官方的标准步骤”如何在 RAC 集群或单机 ASM 环境中对已经存在的 Diskgroup 添加新磁盘（最佳实践） (文档 ID 1946668.1)”以及我们多年的数据库维护经验，

建议后续添加ASM磁盘时遵从如下步骤：

步骤	操作明细
1	对现有集群状态、ASM磁盘状态、使用率、磁盘大小检查
2	存储层面进行相应的LUN划分，建议划分的LUN大小与原ASM磁盘组中的磁盘大小一致
3	存储层新划分LUN到主机操作系统层面的映射，确认集群中每个主机可以正常识别读写LUN
4	参照已有的ASM磁盘配置相应的别名或多路径映射，集群kfed命令可以读取
5	使用新划分LUN新建临时测试磁盘组确认新划分LUN可以被ASM使用
6	删除新加的测试磁盘组
7	将新划分LUN加入所需的ASM磁盘组中
8	检查ASM磁盘组rebalance完成状态，检查集群及数据库的状态

Oracle Asm 增加磁盘操作不当引起ASM无法启动故障处理记录

概述

医院数据库监控平台显示连接HIS数据库1节点异常，通过远程的方式接入数据库进行故障排除，快速解决了问题使得业务恢复正常使用。

问题及相关日志分析

1. 检查监听

通过2021/05/18的巡检报告中发现监听中存在1.80的IP地址，而此次检查发现监听中无1.80的IP地址。

2. 检查集群状态

检查集群状态，发现集群中多个资源状态显示OFFLINE。

[grid@cxhisdb02 ~]$ crsctl stat res -t

——————————————————————————–

NAME TARGET STATE SERVER STATE_DETAILS

——————————————————————————–

Local Resources

——————————————————————————–

ora.DATA.dg

ONLINE OFFLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.LISTENER.lsnr

ONLINE OFFLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.OCR.dg

ONLINE OFFLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.SSD.dg

ONLINE OFFLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.asm

ONLINE OFFLINE cxhisdb01

ONLINE ONLINE cxhisdb02 Started

ora.gsd

OFFLINE OFFLINE cxhisdb01

OFFLINE OFFLINE cxhisdb02

ora.net1.network

ONLINE OFFLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.ons

ONLINE OFFLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.registry.acfs

ONLINE OFFLINE cxhisdb01

ONLINE ONLINE cxhisdb02

——————————————————————————–

Cluster Resources

——————————————————————————–

ora.LISTENER_SCAN1.lsnr

1 ONLINE ONLINE cxhisdb02

ora.cvu

1 ONLINE OFFLINE

ora.cxhisdb01.vip

1 ONLINE OFFLINE

ora.cxhisdb02.vip

1 ONLINE ONLINE cxhisdb02

ora.hospital.db

1 ONLINE OFFLINE

2 ONLINE ONLINE cxhisdb02 Open

ora.oc4j

1 ONLINE OFFLINE

ora.scan1.vip

1 ONLINE ONLINE cxhisdb02

3. 检查集群日志

仅有CRSD服务异常，其他集群资源均正常，因此数据库未宕机。

2021-05-26 13:23:46.059:

[crsd(145215)]CRS-1006:The OCR location +OCR is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:23:46.068:

[crsd(145215)]CRS-1006:The OCR location +OCR is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:23:56.293:

[/u01/app/11.2.0/grid/bin/oraagent.bin(66885)]CRS-5822:Agent ‘/u01/app/11.2.0/grid/bin/oraagent_grid’ disconnected from server. Details at (:CRSAGF00117:) {0:21:18} in

/u01/app/11.2.0/grid/log/cxhisdb01/agent/crsd/oraagent_grid/oraagent_grid.log.

2021-05-26 13:23:56.294:

[/u01/app/11.2.0/grid/bin/oraagent.bin(31320)]CRS-5822:Agent ‘/u01/app/11.2.0/grid/bin/oraagent_oracle’ disconnected from server. Details at (:CRSAGF00117:) {0:19:5060

3} in /u01/app/11.2.0/grid/log/cxhisdb01/agent/crsd/oraagent_oracle/oraagent_oracle.log.

2021-05-26 13:23:56.461:

[/u01/app/11.2.0/grid/bin/orarootagent.bin(145347)]CRS-5822:Agent ‘/u01/app/11.2.0/grid/bin/orarootagent_root’ disconnected from server. Details at (:CRSAGF00117:) {0:

5:1568} in /u01/app/11.2.0/grid/log/cxhisdb01/agent/crsd/orarootagent_root/orarootagent_root.log.

2021-05-26 13:23:56.485:

[/u01/app/11.2.0/grid/bin/scriptagent.bin(145549)]CRS-5822:Agent ‘/u01/app/11.2.0/grid/bin/scriptagent_grid’ disconnected from server. Details at (:CRSAGF00117:) {0:9:

68} in /u01/app/11.2.0/grid/log/cxhisdb01/agent/crsd/scriptagent_grid/scriptagent_grid.log.

2021-05-26 13:23:56.651:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:23:58.540:

[crsd(5795)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:23:58.548:

[crsd(5795)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:23:58.964:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:00.374:

[crsd(5834)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:00.382:

[crsd(5834)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:01.010:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:02.447:

[crsd(5886)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:02.455:

[crsd(5886)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:03.068:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:04.457:

[crsd(5909)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:04.465:

[crsd(5909)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:05.102:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:06.492:

[crsd(5937)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:06.501:

[crsd(5937)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:07.132:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:08.517:

[crsd(5986)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:08.525:

[crsd(5986)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:09.162:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:10.544:

[crsd(6015)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:10.552:

[crsd(6015)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:11.193:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:12.581:

[crsd(6051)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:12.589:

[crsd(6051)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:13.223:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:14.614:

[crsd(6070)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:14.622:

[crsd(6070)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:15.253:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:16.643:

[crsd(6090)]CRS-1013:The OCR location in an ASM disk group is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:16.650:

[crsd(6090)]CRS-0804:Cluster Ready Service aborted due to Oracle Cluster Registry error [PROC-26: Error while accessing the physical storage

]. Details at (:CRSD00111:) in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:24:17.284:

[ohasd(144192)]CRS-2765:Resource ‘ora.crsd’ has failed on server ‘cxhisdb01’.

2021-05-26 13:24:17.284:

[ohasd(144192)]CRS-2771:Maximum restart attempts reached for resource ‘ora.crsd’; will not restart.

2021-05-26 13:24:17.315:

[ohasd(144192)]CRS-2769:Unable to failover resource ‘ora.crsd’.

4. 检查ASM日志

ASM日志中2021/05/26 12:19:57显示OCR仲裁盘有问题，13:23出现读写错误。

Wed May 26 12:19:57 2021

WARNING: Waited 15 secs for write IO to PST disk 0 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 2 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 3 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 4 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 0 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 2 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 3 in group 3.

WARNING: Waited 15 secs for write IO to PST disk 4 in group 3.

Wed May 26 12:19:57 2021

NOTE: process _b000_+asm1 (160488) initiating offline of disk 0.1409468596 (OCR_0000) with mask 0x7e in group 3

NOTE: process _b000_+asm1 (160488) initiating offline of disk 2.1409468594 (OCR_0002) with mask 0x7e in group 3

NOTE: process _b000_+asm1 (160488) initiating offline of disk 3.1409468595 (OCR_0003) with mask 0x7e in group 3

NOTE: process _b000_+asm1 (160488) initiating offline of disk 4.1409468592 (OCR_0004) with mask 0x7e in group 3

NOTE: checking PST: grp = 3

GMON checking disk modes for group 3 at 15 for pid 46, osid 160488

ERROR: no read quorum in group: required 3, found 1 disks

NOTE: checking PST for grp 3 done.

NOTE: initiating PST update: grp = 3, dsk = 0/0x5402c8b4, mask = 0x6a, op = clear

NOTE: initiating PST update: grp = 3, dsk = 2/0x5402c8b2, mask = 0x6a, op = clear

NOTE: initiating PST update: grp = 3, dsk = 3/0x5402c8b3, mask = 0x6a, op = clear

NOTE: initiating PST update: grp = 3, dsk = 4/0x5402c8b0, mask = 0x6a, op = clear

GMON updating disk modes for group 3 at 16 for pid 46, osid 160488

ERROR: no read quorum in group: required 3, found 1 disks

Wed May 26 12:19:57 2021

NOTE: cache dismounting (not clean) group 3/0xA242386E (OCR)

NOTE: messaging CKPT to quiesce pins Unix process pid: 160495, image: oracle@cxhisdb01 (B001)

Wed May 26 12:19:57 2021

NOTE: halting all I/Os to diskgroup 3 (OCR)

Wed May 26 12:19:57 2021

NOTE: LGWR doing non-clean dismount of group 3 (OCR)

NOTE: LGWR sync ABA=15.85 last written ABA 15.85

WARNING: Offline for disk OCR_0000 in mode 0x7f failed.

WARNING: Offline for disk OCR_0002 in mode 0x7f failed.

WARNING: Offline for disk OCR_0003 in mode 0x7f failed.

WARNING: Offline for disk OCR_0004 in mode 0x7f failed.

Wed May 26 12:19:58 2021

kjbdomdet send to inst 2

detach from dom 3, sending detach message to inst 2

Wed May 26 12:19:58 2021

NOTE: No asm libraries found in the system

Wed May 26 12:19:58 2021

List of instances:

1 2

Dirty detach reconfiguration started (new ddet inc 1, cluster inc 4)

Global Resource Directory partially frozen for dirty detach

* dirty detach – domain 3 invalid = TRUE

2 GCS resources traversed, 0 cancelled

Dirty Detach Reconfiguration complete

Wed May 26 12:19:58 2021

WARNING: dirty detached from domain 3

NOTE: cache dismounted group 3/0xA242386E (OCR)

2021-05-26 13:23:46.059:

[crsd(145215)]CRS-1006:The OCR location +OCR is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

2021-05-26 13:23:46.068:

[crsd(145215)]CRS-1006:The OCR location +OCR is inaccessible. Details in /u01/app/11.2.0/grid/log/cxhisdb01/crsd/crsd.log.

问题处理步骤

由于数据库仅1节点OCR磁盘组访问异常，2节点以及其他盘访问正常，因而导致1节点CRS资源异常，且数据库未宕机，处理步骤需启动CRS资源，并且重启监听。

1. 1节点挂载OCR磁盘组

[root@cxhisdb01 ~]# su – grid

[grid@cxhisdb01 ~]$ sqlplus / as sysasm

SQL*Plus: Release 11.2.0.4.0 Production on Thu May 27 12:12:17 2021

Connected to:

Oracle Database 11g Enterprise Edition Release 11.2.0.4.0 – 64bit Production

With the Real Application Clusters and Automatic Storage Management options

SQL> alter diskgroup ocr mount;

Diskgroup altered.

SQL> exit

2. 启动CRS

[grid@cxhisdb01 ~]$ crsctl start res ora.crsd -init

CRS-2672: Attempting to start ‘ora.crsd’ on ‘cxhisdb01’

CRS-2676: Start of ‘ora.crsd’ on ‘cxhisdb01’ succeeded

3. 检查集群资源

[grid@cxhisdb01 ~]$ crsctl stat res -t

——————————————————————————–

NAME TARGET STATE SERVER STATE_DETAILS

——————————————————————————–

Local Resources

——————————————————————————–

ora.DATA.dg

ONLINE ONLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.LISTENER.lsnr

ONLINE ONLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.OCR.dg

ONLINE ONLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.SSD.dg

ONLINE ONLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.asm

ONLINE ONLINE cxhisdb01 Started

ONLINE ONLINE cxhisdb02 Started

ora.gsd

OFFLINE OFFLINE cxhisdb01

OFFLINE OFFLINE cxhisdb02

ora.net1.network

ONLINE ONLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.ons

ONLINE ONLINE cxhisdb01

ONLINE ONLINE cxhisdb02

ora.registry.acfs

ONLINE ONLINE cxhisdb01

ONLINE ONLINE cxhisdb02

——————————————————————————–

Cluster Resources

——————————————————————————–

ora.LISTENER_SCAN1.lsnr

1 ONLINE ONLINE cxhisdb02

ora.cvu

1 ONLINE ONLINE cxhisdb01

ora.cxhisdb01.vip

1 ONLINE ONLINE cxhisdb01

ora.cxhisdb02.vip

1 ONLINE ONLINE cxhisdb02

ora.hospital.db

1 ONLINE ONLINE cxhisdb01 Open

2 ONLINE ONLINE cxhisdb02 Open

ora.oc4j

1 ONLINE ONLINE cxhisdb01

ora.scan1.vip

1 ONLINE ONLINE cxhisdb02

4. 重启监听

[grid@cxhisdb01 ~]$ srvctl stop listener -n cxhisdb01

[grid@cxhisdb01 ~]$ srvctl start listener -n cxhisdb01

[grid@cxhisdb01 ~]$ lsnrctl status

LSNRCTL for Linux: Version 11.2.0.4.0 – Production on 27-MAY-2021 12:20:44

Connecting to (DESCRIPTION=(ADDRESS=(PROTOCOL=IPC)(KEY=LISTENER)))

STATUS of the LISTENER

————————

Alias LISTENER

Version TNSLSNR for Linux: Version 11.2.0.4.0 – Production

Start Date 27-MAY-2021 12:20:40

Uptime 0 days 0 hr. 0 min. 4 sec

Trace Level off

Security ON: Local OS Authentication

SNMP OFF

Listener Parameter File /u01/app/11.2.0/grid/network/admin/listener.ora

Listener Log File /u01/app/grid/diag/tnslsnr/cxhisdb01/listener/alert/log.xml

Listening Endpoints Summary…

(DESCRIPTION=(ADDRESS=(PROTOCOL=ipc)(KEY=LISTENER)))

(DESCRIPTION=(ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.80)(PORT=1521)))

(DESCRIPTION=(ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.10)(PORT=1521)))

Services Summary…

Service “hospital” has 1 instance(s).

Instance “hospital1”, status READY, has 1 handler(s) for this service…

Service “hospitalXDB” has 1 instance(s).

Instance “hospital1”, status READY, has 1 handler(s) for this service…

The command completed successfully

[grid@cxhisdb01 ~]$ cd /u01/app/grid/diag/tnslsnr/cxhisdb01/listener/trace/

[grid@cxhisdb01 trace]$ tail -f listener.log

27-MAY-2021 12:20:51 * (CONNECT_DATA=(CID=(PROGRAM=)(HOST=__jdbc__)(USER=))(SERVICE_NAME=hospital)(CID=(PROGRAM=)(HOST=__jdbc__)(USER=))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.171)(PORT=62045)) * establish * hospital * 0

27-MAY-2021 12:20:51 * (CONNECT_DATA=(SERVICE_NAME=hospital)(CID=(PROGRAM=e:\zjhis\电子病历PB9\emrproject.exe)(HOST=ZY-603300-02-YS)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.9.58)(PORT=3676)) * establish * hospital * 0

27-MAY-2021 12:20:55 * (CONNECT_DATA=(SERVICE_NAME=HOSPITAL)(CID=(PROGRAM=oracle)(HOST=lis-server)(USER=Administrator))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.11)(PORT=23431)) * establish * HOSPITAL * 0

27-MAY-2021 12:20:55 * (CONNECT_DATA=(SERVICE_NAME=hospital)(CID=(PROGRAM=e:\zjhis\电子病历PB9\emrproject.exe)(HOST=JZ-EK-001)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.3.99)(PORT=4442)) * establish * hospital * 0

27-MAY-2021 12:20:56 * (CONNECT_DATA=(SID=hospital1)(CID=(PROGRAM=配置数据库.exe)(HOST=YYJQZZJFW)(USER=Administrator))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.244)(PORT=58397)) * establish * hospital1 * 0

27-MAY-2021 12:20:56 * (CONNECT_DATA=(SERVICE_NAME=HOSPITAL)(CID=(PROGRAM=oracle)(HOST=lis-server)(USER=Administrator))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.11)(PORT=23432)) * establish * HOSPITAL * 0

27-MAY-2021 12:20:56 * (CONNECT_DATA=(SERVICE_NAME=hospital)(CID=(PROGRAM=e:\zjhis\电子病历PB9\emrproject.exe)(HOST=ZY-603300-02-YS)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.9.58)(PORT=3681)) * establish * hospital * 0

27-MAY-2021 12:20:57 * service_update * hospital1 * 0

27-MAY-2021 12:20:58 * (CONNECT_DATA=(SERVICE_NAME=HOSPITAL)(CID=(PROGRAM=oracle)(HOST=lis-server)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.11)(PORT=23438)) * establish * HOSPITAL * 0

27-MAY-2021 12:20:59 * (CONNECT_DATA=(SERVICE_NAME=hospital)(CID=(PROGRAM=e:\zjhis\电子病历PB9\emrproject.exe)(HOST=JZ-EK-001)(USER=his))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.3.99)(PORT=4460)) * establish * hospital * 0

27-MAY-2021 12:21:00 * service_update * hospital1 * 0

Thu May 27 12:21:01 2021

27-MAY-2021 12:21:01 * (CONNECT_DATA=(SID=hospital1)(CID=(PROGRAM=配置数据库.exe)(HOST=YYJQZZJFW)(USER=Administrator))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.244)(PORT=58398)) * establish * hospital1 * 0

27-MAY-2021 12:21:03 * service_update * hospital1 * 0

27-MAY-2021 12:21:03 * (CONNECT_DATA=(SERVICE_NAME=HOSPITAL)(CID=(PROGRAM=oracle)(HOST=lis-server)(USER=Administrator))) * (ADDRESS=(PROTOCOL=tcp)(HOST=192.2.1.11)(PORT=23444)) * establish * HOSPITAL * 0

总结与后续处理建议

1. 问题总结

HIS数据库1节点访问OCR仲裁盘异常导致CRSD异常，进而引起1节点上多个集群资源offline，监听资源中相较于以前也少了192.2.1.80的IP，因此监控平台显示连接HIS数据库1节点异常，由于2节点和其他磁盘组无异常，因此数据库未发生宕机现象。

2. 处理操作

HIS数据库1节点手工mount OCR磁盘组
启动CRS资源
1节点重启监听资源

3. 后续建议

此次故障发生是由于集群资源访问OCR磁盘组异常导致，建议联系存储工程师，排查相关时间点存储运行情况，并定期做好存储巡检和状态监控工作。

Oracle voting disk 故障处理一例

概述

某医院客户于2021年11月5日上午9时左右出现HIS数据库连接异常的情况，在此情况下通过远程的方式接入数据库进行故障排除，快速解决了问题使得业务恢复正常使用。

问题及相关日志分析

1. 问题现状

前台业务反映HIS数据库时而连得上时而连不上，连不上时候报错ORA-12516，报错内容截图如下：

由于业务通过节点1的vip（192.2.1.10）连接数据库，测试节点1的vip连接，结果如下：

[oracle@cxhisdb01 ~]$ sqlplus a/b@192.2.1.10:1521/hospital

SQL*Plus: Release 11.2.0.4.0 Production on Fri Nov 5 09:45:15 2021

ERROR:

ORA-12516: TNS:listener could not find available handler with matching protocol

stack

2. 检查数据库alert日志

ORA-00020: maximum number of processes (4500) exceeded

ORA-20 errors will not be written to the alert log for

the next minute. Please look at trace files to see all

the ORA-20 errors.

Fri Nov 05 09:35:48 2021

Thread 1 advanced to log sequence 191580 (LGWR switch)

Current log# 1 seq# 191580 mem# 0: +SSD/hospital/onlinelog/redo01.log

Fri Nov 05 09:35:48 2021

LNS: Standby redo logfile selected for thread 1 sequence 191580 for destination LOG_ARCHIVE_DEST_4

Fri Nov 05 09:35:48 2021

LNS: Standby redo logfile selected for thread 1 sequence 191580 for destination LOG_ARCHIVE_DEST_3

Fri Nov 05 09:35:48 2021

LNS: Standby redo logfile selected for thread 1 sequence 191580 for destination LOG_ARCHIVE_DEST_2

Fri Nov 05 09:35:48 2021

Archived Log entry 904316 added for thread 1 sequence 191579 ID 0x431aef8a dest 1:

Fri Nov 05 09:37:30 2021

ORA-00020: maximum number of processes (4500) exceeded

ORA-20 errors will not be written to the alert log for

the next minute. Please look at trace files to see all

the ORA-20 errors.

alert中出现大量的ORA-00020错误告警，数据库连接数达到4500上限。

3. 检查数据库参数和连接

SQL> show parameter process;

NAME TYPE VALUE

———————————— ———– ——————————

aq_tm_processes integer 1

cell_offload_processing boolean TRUE

db_writer_processes integer 10

gcs_server_processes integer 4

global_txn_processes integer 1

job_queue_processes integer 1000

log_archive_max_processes integer 4

processes integer 4500

processor_group_name string

SQL> select inst_id,count(*) from gv$session group by inst_id;

INST_ID COUNT(*)

———- ———-

1 4496

2 715

查询得知，1节点连接数达到4496，已经达到连接数4500的上限，因此前端业务出现连接不上的情况。

4. 定位异常连接

SQL> select program,machine,count(*)

2 from gv$session group by program,machine,sql_id order by 3;

…

PROGRAM MACHINE COUNT(*)

——————————– ————————– ———-

w3wp.exe WORKGROUP\SXZZ_SERVER_CX 10

Medinfo.Quartz.DbJob.exe WORKGROUP\QUANGUO-NB 10

yszj.exe WORKGROUP\MZ-3F-EBHYS7 10

yszj.exe WORKGROUP\MZ-2F-NK2YS4 10

yszj.exe WORKGROUP\MZ1F-EKYS7 10

emrproject.exe WORKGROUP\ZY1-2X-YS2 10

PROGRAM MACHINE COUNT(*)

——————————– ————————– ———-

yszj.exe WORKGROUP\MZ-2F-WKYS9 11

yszj.exe WORKGROUP\MZ-2F-FKYS13 11

emrjz.exe WORKGROUP\JZ-QJS-HS1 11

yzxt.exe WORKGROUP\ZY-604200-01-HS 11

yszj.exe WORKGROUP\MZ-2F-GKYS3 12

yszj.exe WORKGROUP\MZ-3F-KQYS6 12

yzxt.exe WORKGROUP\ZY-609400-02-HS 12

HisServer.exe WORKGROUP\YYGHSERV 12

yszj.exe WORKGROUP\GRK-GBMZ-01 12

oracle@lis-server (TNS V1-V3) lis-server 12

yszj.exe WORKGROUP\MZ-2F-WKYS10 13

PROGRAM MACHINE COUNT(*)

——————————– ————————– ———-

yszj.exe WORKGROUP\MZ-1F-EKYS6 13

yszj.exe WORKGROUP\MZ-NKJ-01 13

yzxt.exe WORKGROUP\ZY-ZHNK1-HS-04 13

yszj.exe WORKGROUP\MZ-3F-YKYS2 13

yszj.exe WORKGROUP\M3-YK-YS1 14

yszj.exe WORKGROUP\MZ-2F-FKYS-01 14

yszj.exe WORKGROUP\MZ-3F-YKYS4 14

yszj.exe WORKGROUP\MZ-2F-FKYS10 14

yszj.exe WORKGROUP\MZ-3F-TJWKYS1 15

yszj.exe WORKGROUP\MZ-3F-WKYS5 15

MAF.exe WORKGROUP\QUYUSC1 16

PROGRAM MACHINE COUNT(*)

——————————– ————————– ———-

yszj.exe WORKGROUP\MZ-3F-WKYS3 16

yzxt.exe WORKGROUP\HIS-CXYY 16

yszj.exe WORKGROUP\MZ-2F-NK2YS05 16

yszj.exe WORKGROUP\MZ-2F-WKYS5 17

yszj.exe WORKGROUP\MZ-3F-WKYS4 17

yszj.exe WORKGROUP\MZ-3F-KQKYS2 17

w3wp.exe WORKGROUP\WX_DDJY 17

MediII.Adapter.WinForm.Scanner_V WORKGROUP\YYQZ 18

yszj.exe WORKGROUP\MZ-1F-EKYS4 18

yszj.exe WORKGROUP\MZ-2F-NK2YS6 18

emrproject.exe WORKGROUP\3020-CXYY 19

PROGRAM MACHINE COUNT(*)

——————————– ————————– ———-

yszj.exe WORKGROUP\MZ-2F-GK24 19

yszj.exe WORKGROUP\ZY-609400-02-HS 19

yszj.exe WORKGROUP\MZ-GK-25 20

emrproject.exe WORKGROUP\HIS-PC 20

yszj.exe WORKGROUP\ZY-609400-02-YS 21

yszj.exe WORKGROUP\MZ-GUKE-YS04 21

yszj.exe WORKGROUP\MZ-2F-FKYS9 22

yszj.exe MSHOME\JZ-WKYS2 22

yszj.exe WORKGROUP\MZ-2F-NK2YS-01 22

w3wp.exe WORKGROUP\YINYIQIANZHI1 23

yszj.exe WORKGROUP\MZ-2F-WKYS4 23

PROGRAM MACHINE COUNT(*)

——————————– ————————– ———-

yszj.exe WORKGROUP\MZ-2F-GKYS2 23

yszj.exe WORKGROUP\MZ-2F-FKYS6 24

yszj.exe WORKGROUP\MZ-2F-WKYS6 25

yszj.exe WORKGROUP\MZ-XEK7 25

yszj.exe WORKGROUP\MZ-3F-EBYHYS2 25

yszj.exe WORKGROUP\MZ-1F-EKYS8 26

yszj.exe WORKGROUP\MZ-3F-ZYYS-8 26

yszj.exe WORKGROUP\MZ-2F-FKYS3 27

yszj.exe WORKGROUP\MZ-2F-WKYS2 27

yszj.exe WORKGROUP\MZ-2F-WKYS3 28

yszj.exe WORKGROUP\MZ-2F-NK2YS2 28

PROGRAM MACHINE COUNT(*)

——————————– ————————– ———-

yszj.exe WORKGROUP\MZ-2F-NK2-YS09 29

oracle@cxemrdb01 (TNS V1-V3) cxemrdb01 30

yszj.exe WORKGROUP\MZ-2F-NK2-9 30

yszj.exe WORKGROUP\MZ-3F-ZYYS2 30

yszj.exe WORKGROUP\MZ-3F-EBHYS6 30

yszj.exe WORKGROUP\MZ-2F-GKYS21 31

yszj.exe WORKGROUP\MZ-WAIKE-YS12 33

yzxt.exe WORKGROUP\ZY-7D-HS3 33

yszj.exe WORKGROUP\MZ-2F-FKYS5 34

yszj.exe WORKGROUP\MZ-2F-NK1YS3 34

yszj.exe WORKGROUP\DESKTOP-VI7O6SC 36

PROGRAM MACHINE COUNT(*)

——————————– ————————– ———-

yszj.exe WORKGROUP\MZ-2F-WKYS1 38

yszj.exe WORKGROUP\MZ-NK2-07 39

yszj.exe WORKGROUP\MZ-2F-FKYS11 42

yszj.exe WORKGROUP\MZ-2F-NK1YS5 43

yszj.exe WORKGROUP\JZ-EK-001 45

yszj.exe WORKGROUP\LHMZ_YS2 46

yszj.exe WORKGROUP\MZ-2F-NK1YS2 50

w3wp.exe WORKGROUP\HIS-YIYAO 50

yszj.exe WORKGROUP\MZ-2F-WKYS11 56

yszj.exe WORKGROUP\LHMZ_YS1 56

yszj.exe WORKGROUP\MZ-2F-NK1YS14 58

PROGRAM MACHINE COUNT(*)

——————————– ————————– ———-

yszj.exe WORKGROUP\MZ-2F-NK1YS13 62

yszj.exe WORKGROUP\JZ-NK-2 64

yszj.exe WORKGROUP\MZ-2F-WKYS007 66

yszj.exe WORKGROUP\MZ-2F-FKYS4 66

yszj.exe WORKGROUP\MZ-2F-NKYS7 69

yszj.exe WORKGROUP\MZ-2F-NK1YS4 103

yszj.exe WORKGROUP\MZ-PAOZHEN-01 108

oracle@lis-server (TNS V1-V3) lis-server 136

oracle@cxemrdb01 (TNS V1-V3) cxemrdb01 395

查询得知，yszj.exe应用程序对HIS数据库建立了较高数量的连接，其中主机名为“MZ-2F”开头的客户端尤为明显。

三、问题处理步骤

1. kill session

查询yszj.exe应用程序的进程ID

SQL> select b.spid

2 from v$session a,v$process b

3 where a.paddr=b.addr

4 and a.program=’yszj.exe’;

5 and a.machine like ‘MZ-2F%’;

SPID

————————

…

70593

72017

76189

77305

78409

78481

78636

78676

79541

80183

80816

…

杀会话

…

[oracle@cxhisdb01 ~]$ kill -9 70593

[oracle@cxhisdb01 ~]$ kill -9 72017

[oracle@cxhisdb01 ~]$ kill -9 76189

[oracle@cxhisdb01 ~]$ kill -9 77305

[oracle@cxhisdb01 ~]$ kill -9 78409

[oracle@cxhisdb01 ~]$ kill -9 78481

[oracle@cxhisdb01 ~]$ kill -9 78636

[oracle@cxhisdb01 ~]$ kill -9 78676

[oracle@cxhisdb01 ~]$ kill -9 79541

[oracle@cxhisdb01 ~]$ kill -9 80183

[oracle@cxhisdb01 ~]$ kill -9 80816

…

执行kill session后，效果不明显，因为会话查杀后，又有新的会话连接进来。

2. 客户沟通

通过与客户沟通了解到，YZSJ.EXE程序新上了检查报告互认功能，会导致程序连接不释放问题，导致程序到数据库连接数不断增加，从而数据库连接数达到上限。

3. 释放进程

客户安排前台业务程序主机重启，释放连接进程，业务端主机重启后，检查连接情况如下：

SQL> select inst_id,count(*) from gv$session group by inst_id;

INST_ID COUNT(*)

———- ———-

1 3427

2 681

SQL> select program,machine,count(*)

2 from gv$session where program=’yszj.exe’ group by program,machine order by 3;

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\XZ-YGK-07 1

yszj.exe WORKGROUP\YJ-NKJ-MZFT 1

yszj.exe WORKGROUP\MZ-3F-WKYS4 2

yszj.exe WORKGROUP\RLS 2

yszj.exe WORKGROUP\MZ-3F-KQKYS2 2

yszj.exe WORKGROUP\MZ-2F-GKYS3 2

yszj.exe WORKGROUP\MZ-FWT-01 2

yszj.exe WORKGROUP\FDHJ54365 2

yszj.exe WORKGROUP\MZ-2F-WKYS2 2

yszj.exe WORKGROUP\MZ-3F-KQK7 2

yszj.exe WORKGROUP\MZ-2F-WKYS9 2

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\MZ-2F-NK1-YS6 2

yszj.exe WORKGROUP\MZ-2F-FKYS9 2

yszj.exe WORKGROUP\MZ-2F-GKYS2 2

yszj.exe WORKGROUP\MZ-2F-WKYS6 2

yszj.exe WORKGROUP\CXYY-YZ2 2

yszj.exe WORKGROUP\MZ-2F-FKYS13 2

yszj.exe WORKGROUP\MZ-KQK-YS-1 2

yszj.exe WORKGROUP\MZ-2F-NK1YS13 2

yszj.exe WORKGROUP\ZY-XYJHZX-01-YS 2

yszj.exe WORKGROUP\MZ-2F-WKYS8 2

yszj.exe WORKGROUP\MZ-2F-FKYS4 2

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\MZ-2F-NK1YS4 2

yszj.exe WORKGROUP\MZ-2F-WKYS3 2

yszj.exe WORKGROUP\MZ-2F-NKYS7 2

yszj.exe WORKGROUP\MZ-3F-WKYS5 2

yszj.exe WORKGROUP\MZ-2F-WKYS007 2

yszj.exe WORKGROUP\GAMZ-01 2

yszj.exe WORKGROUP\MZ-WJ3 2

yszj.exe WORKGROUP\MZ-3F-PFYS3 2

yszj.exe WORKGROUP\ZY-609400-03-HS 2

yszj.exe WORKGROUP\MZ-2F-WKYS1 2

yszj.exe WORKGROUP\ZY1-4X-YS5 2

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\MZ-2F-FKYS7 2

yszj.exe WORKGROUP\MZ-3F-TJWKYS1 2

yszj.exe WORKGROUP\ZY-601700-05-YS 2

yszj.exe WORKGROUP\MZ-PICC-02 2

yszj.exe WORKGROUP\JZ-EK-001 2

yszj.exe WORKGROUP\FSK-SSZX-02 2

yszj.exe WORKGROUP\ZY1-5D-YS4 2

yszj.exe WORKGROUP\MZ-3F-KFYJT 2

yszj.exe WORKGROUP\ZY-XSR-YS5 2

yszj.exe WORKGROUP\MZ-ZY-YS9 2

yszj.exe WORKGROUP\MZ-3F-EBYHYS2 2

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\MZ-2F-FKYS15 2

yszj.exe WORKGROUP\MZ-2F-FK12 2

yszj.exe WORKGROUP\MZ-2F-NK1YS5 2

yszj.exe WORKGROUP\MZ-WK-SKZK 2

yszj.exe WORKGROUP\MZ-2F-NK1YS-17 2

yszj.exe WORKGROUP\MZ-2F-FKYS10 2

yszj.exe WORKGROUP\MZ-2F-GK24 2

yszj.exe WORKGROUP\MZ-2F-NK2YS10 2

yszj.exe WORKGROUP\MZ-KOUQIANG-Y11 2

yszj.exe WORKGROUP\JZ-EK-YS-06 2

yszj.exe WORKGROUP\MZ-NKJ-01 2

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\MZ-2F-NK2-9 2

yszj.exe WORKGROUP\MZ-3F-ZYYS2 2

yszj.exe WORKGROUP\MZ-3F-EBHYS6 2

yszj.exe MSHOME\JZ-QJ-04 2

yszj.exe WORKGROUP\MZ-3F-YKYS2 2

yszj.exe WORKGROUP\MZ-3F-ZY-YS3- 2

yszj.exe WORKGROUP\MZ-2F-NK1YS2 2

yszj.exe WORKGROUP\MZ-KOUQIANG-Y13 2

yszj.exe WORKGROUP\MZ-3F-ZYYS10 2

yszj.exe WORKGROUP\JZ-EBH-01 2

yszj.exe WORKGROUP\JZ-GK-YS1 2

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\MZ-NK1-YS-017 2

yszj.exe WORKGROUP\MZ1F-EKYS7 2

yszj.exe WORKGROUP\JZ-NK-2 2

yszj.exe WORKGROUP\MZ-2F-FKYS11 2

yszj.exe WORKGROUP\MZ-2F-FKYS6 2

yszj.exe WORKGROUP\MZ-2F-FKYS-01 2

yszj.exe WORKGROUP\MZ-2F-NK1YS3 2

yszj.exe WORKGROUP\ZY2-11F-YS2 2

yszj.exe WORKGROUP\MZ-WAIKE-YS12 2

yszj.exe WORKGROUP\MZ-2F-NK2-YS09 2

yszj.exe WORKGROUP\MZ-FWT-03 2

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\MZ-2F-NK2YS6 2

yszj.exe WORKGROUP\MZ-1F-EKYS6 2

yszj.exe WORKGROUP\JZ_XIONGTONG 2

yszj.exe WORKGROUP\MZ-3F-KQYS6 2

yszj.exe WORKGROUP\MZ-3F-YK13 2

yszj.exe WORKGROUP\ZY-TXBQ-YYK 2

yszj.exe WORKGROUP\LXZL-205 2

yszj.exe WORKGROUP\MZ-NK-11-YS 2

yszj.exe WORKGROUP\MZ-3F-PFK-2 2

yszj.exe WORKGROUP\MZ-NK-YS8 2

yszj.exe WORKGROUP\MZ-2F-NK2YS2 2

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\MZ-3F-KQKYS4 2

yszj.exe WORKGROUP\MZ-2F-NK2YS-01 2

yszj.exe WORKGROUP\MZ-2F-NK1YS14 2

yszj.exe WORKGROUP\MZ-3F-ZYYS1 2

yszj.exe WORKGROUP\FRMZ-YS-03 2

yszj.exe WORKGROUP\MZ-NKZJ-01 2

yszj.exe WORKGROUP\MZ-2F-NK2YS3 2

yszj.exe WORKGROUP\MZ-2F-NK2YS05 2

yszj.exe WORKGROUP\MZ-XEK7 2

yszj.exe WORKGROUP\MZ-ZJNK-YS8-A 2

yszj.exe WORKGROUP\MZ-ZHFW-05 2

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\MZ-2F-FKYS3 2

yszj.exe WORKGROUP\ZY2-6F-ZLNKYS01 2

yszj.exe WORKGROUP\MZ-FWT-04 2

yszj.exe WORKGROUP\FRMZ_YS2 2

yszj.exe WORKGROUP\MZ-3F-WKYS3 2

yszj.exe WORKGROUP\JZ-QJS-YS3 2

yszj.exe WORKGROUP\ZY-703700-05-YS 2

yszj.exe WORKGROUP\TSZX-02 3

yszj.exe WORKGROUP\MZ-3F-KQXFS1 3

yszj.exe WORKGROUP\DESKTOP-VI7O6SC 3

yszj.exe WORKGROUP\ZY-600400-06-YS 3

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\FSK-DPS-08 4

yszj.exe WORKGROUP\MZ-3F-ZYYS5 4

yszj.exe MSHOME\JZ-WKYS01 4

yszj.exe WORKGROUP\SF-YWS-YS1 5

yszj.exe WORKGROUP\MZ-3F-ZYYS4 5

yszj.exe WORKGROUP\ZY1-1X-YS 5

yszj.exe WorkGroup\USER-2021SXCVV 5

yszj.exe WORKGROUP\MZ2-NJZX-001 5

yszj.exe WORKGROUP\ZY-GK3-HS4 6

yszj.exe WORKGROUP\MZ-GB-CDMZ 6

yszj.exe WORKGROUP\MZ-2F-NK2YS11 8

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\MZ-3L-ZYNK-007 9

yszj.exe WORKGROUP\MZ-3F-ZJK1 11

yszj.exe WORKGROUP\MZ-3F-KQKYS5 14

yszj.exe WORKGROUP\HIS-PC 15

yszj.exe WORKGROUP\MZ-2F-WKYS10 16

yszj.exe WORKGROUP\GRK-GBMZ-01 16

yszj.exe WORKGROUP\MZ-PIFU-YS3 18

yszj.exe WORKGROUP\ZY-609400-02-YS 21

yszj.exe WORKGROUP\MZ-3F-KQKYS3 22

yszj.exe WORKGROUP\MZ-PAOZHEN-01 23

yszj.exe WORKGROUP\MZ-GUKE-YS04 23

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\MZ-FR-EK-1 24

yszj.exe WORKGROUP\ZY-609400-02-HS 25

yszj.exe WORKGROUP\MZ-3F-PFKYS24 25

yszj.exe WORKGROUP\MZ-2F-WKYS5 34

yszj.exe WORKGROUP\MZ-3F-YKYS4 34

yszj.exe WORKGROUP\MZ-2F-WKYS4 35

yszj.exe WORKGROUP\MZ-1F-EKYS8 35

yszj.exe MSHOME\JZ-WKYS2 35

yszj.exe WORKGROUP\MZ-3F-ZYYS-8 37

yszj.exe WORKGROUP\MZ-1F-EKYS4 45

yszj.exe WORKGROUP\LHMZ_YS2 64

PROGRAM MACHINE COUNT(*)

————— —————————— ———-

yszj.exe WORKGROUP\LHMZ_YS1 78

yszj.exe WORKGROUP\MZ-2F-WKYS11 87

145 rows selected.

通过查询得知，前端业务主机重启后，释放了一部分进程，节点1连接数已经下降至3400+，前端业务连接正常，未发现ORA-12516错误。

总结与后续处理建议

1. 问题总结

YZSJ.EXE应用程序通过192.2.1.10（节点1的VIP）连接HIS数据库，由于YZSJ.EXE程序新上了检查报告互认功能，有程序连接不释放问题，导致程序到数据库连接数不断增加，数据库连接数（4500）达到上限，从而导致前端业务连接数据库失败，出现ORA-12516:TNS:监听程序找不到符合协议堆栈要求的可用处理程序。

2. 处理操作

查杀会话，释放进程资源 ==>效果不明显，会话杀掉后程序会自动重连
重启业务前端主机，释放资源 ==>暂时缓解，数据库连接数已经下降至3400+
修复exe应用程序 ==>彻底解决，预计中午业务研发部门对yszj.exe做修复

3. 后续建议

此次故障发生的根本原因是由于YZSJ.EXE程序更新了检查报告互认功能导致的程序连接不释放导致数据库连接数达到上限，从而导致前端业务无法连接数据库，针对此次发生的故障，建议点如下：

业务测试，业务更新上线前，在开发库或测试环境做充分的业务功能性测试和性能测试；
调整数据库连接数参数，当前数据库的process参数为4500，可根据实际需要适当调整增大数据库的process参数，调整此参数需要重启数据库生效，因此需要安排停机时间。

Oracle 连接异常处理故障一例

归档日志间隙（Archive Gap）

解决日志间隙的方式

自动日志间隙解决方案

FAL (Fetch Archive Log)日志间隙解决方案

手工解决日志间隙

使用增量备份前滚（仅适用于物理 Standby）

概述

集群及数据库日志分析

1. 15:36分主机重启前的错误信息

2. 节点2重启主机后相关日志信息

3. 问题分析描述

三、总结与后续处理建议

1. 问题总结

4. 后续对于集群添加磁盘的建议

概述

问题及相关日志分析

1. 检查监听

2. 检查集群状态

3. 检查集群日志

4. 检查ASM日志

问题处理步骤

1. 1节点挂载OCR磁盘组

2. 启动CRS

3. 检查集群资源

4. 重启监听

总结与后续处理建议

1. 问题总结

2. 处理操作

3. 后续建议

概述

问题及相关日志分析

1. 问题现状

2. 检查数据库alert日志

3. 检查数据库参数和连接

4. 定位异常连接

三、问题处理步骤

1. kill session

2. 客户沟通

3. 释放进程

总结与后续处理建议

1. 问题总结

2. 处理操作

3. 后续建议

Oracle 恢复工具 Mdata 5.0.1 版本发布

近期文章

分类目录

扫码关注微信公众号:Oracle运维那些事 获取定期发布的数据库运维的有趣事情!

近期活动

扫码关注微信公众号:Oracle运维那些事获取定期发布的数据库运维的有趣事情!